unstructured-ingest 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +8 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +6 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +1 -4
- unstructured_ingest/v2/processes/connectors/couchbase.py +204 -22
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +5 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +0 -2
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/METADATA +260 -257
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +14 -12
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.0.4" # pragma: no cover
+__version__ = "0.0.5" # pragma: no cover

unstructured_ingest/v2/pipeline/pipeline.py
@@ -188,22 +188,26 @@ class Pipeline:
         indices = self.indexer_step.run()
         indices_inputs = [{"file_data_path": i} for i in indices]
         if not indices_inputs:
+            logger.info("No files to process after indexer, exiting")
             return
 
         # Initial filtering on indexed content
         indices_inputs = self.apply_filter(records=indices_inputs)
         if not indices_inputs:
+            logger.info("No files to process after filtering indexed content, exiting")
             return
 
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after downloader, exiting")
             return
 
         # Post download filtering
         downloaded_data = self.apply_filter(records=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after filtering downloaded content, exiting")
             return
 
         # Run uncompress if available
@@ -215,6 +219,7 @@ class Pipeline:
         # Post uncompress filtering
         downloaded_data = self.apply_filter(records=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after filtering uncompressed content, exiting")
             return
 
         if not downloaded_data:
@@ -224,6 +229,7 @@ class Pipeline:
         elements = self.partitioner_step(downloaded_data)
         elements = self.clean_results(results=elements)
         if not elements:
+            logger.info("No files to process after partitioning, exiting")
             return
 
         # Run element specific modifiers
@@ -231,6 +237,7 @@ class Pipeline:
             elements = step(elements) if step else elements
             elements = self.clean_results(results=elements)
             if not elements:
+                logger.info(f"No files to process after {step.__class__.__name__}, exiting")
                 return
 
         # Upload the final result
@@ -333,7 +340,7 @@ class Pipeline:
         )
         if len(destination_entry) != 1:
             raise ValueError(
-                "no entry found in
+                "no entry found in destination registry with matching uploader, "
                 "stager and connection configs"
             )
 
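The added lines all apply the same guard-clause pattern: every pipeline stage that can empty the work queue now logs why it is exiting before returning, while control flow stays the same as in 0.0.4. A minimal, hypothetical sketch of that pattern (the `run_stages` helper and its callable stages are illustrative, not part of the package):

```python
import logging

logger = logging.getLogger(__name__)


def run_stages(records: list[dict], stages: list) -> None:
    """Illustrative only: run callable stages in order, exiting early with a log line."""
    for stage in stages:
        records = stage(records)
        if not records:
            # As in the diff: name the stage that emptied the queue, then bail out.
            logger.info(f"No files to process after {type(stage).__name__}, exiting")
            return
```
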
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -13,13 +13,15 @@ from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
-from .couchbase import couchbase_destination_entry
+from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
 from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
+from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
+from .kdbai import kdbai_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
 from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
@@ -47,6 +49,7 @@ add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_d
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
 
+add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
 add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
 
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
@@ -87,3 +90,5 @@ add_destination_entry(
     destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
     entry=azure_cognitive_search_destination_entry,
 )
+
+add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)

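With this wiring, Couchbase is now registered as both a source and a destination, and the new KDB.AI connector as a destination. Conceptually, the registry calls above just key entries by connector type; a simplified, dict-backed sketch of that idea (the real helpers live in `unstructured_ingest.v2.processes.connector_registry` and take registry-entry objects):

```python
# Simplified sketch; the real add_source_entry/add_destination_entry are richer.
source_registry: dict[str, object] = {}
destination_registry: dict[str, object] = {}


def add_source_entry(source_type: str, entry: object) -> None:
    # A connector type such as "couchbase" maps to its source registry entry.
    source_registry[source_type] = entry


def add_destination_entry(destination_type: str, entry: object) -> None:
    # A connector type such as "kdbai" maps to its destination registry entry.
    destination_registry[destination_type] = entry
```
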
unstructured_ingest/v2/processes/connectors/astradb.py
@@ -31,19 +31,16 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "astradb"
 
 
-@dataclass
 class AstraDBAccessConfig(AccessConfig):
     token: str = Field(description="Astra DB Token with access to the database.")
     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
 
 
-@dataclass
 class AstraDBConnectionConfig(ConnectionConfig):
-    connection_type: str = CONNECTOR_TYPE
+    connection_type: str = Field(default=CONNECTOR_TYPE, init=False)
     access_config: Secret[AstraDBAccessConfig]
 
 
-@dataclass
 class AstraDBUploadStagerConfig(UploadStagerConfig):
     pass
 
unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -1,26 +1,42 @@
+import hashlib
 import json
+import sys
+import time
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator, List
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import
-
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
     UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
+    SourceRegistryEntry,
 )
 
 if TYPE_CHECKING:
@@ -49,6 +65,19 @@ class CouchbaseConnectionConfig(ConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
     access_config: Secret[CouchbaseAccessConfig]
 
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def connect_to_couchbase(self) -> "Cluster":
+        from couchbase.auth import PasswordAuthenticator
+        from couchbase.cluster import Cluster
+        from couchbase.options import ClusterOptions
+
+        auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
+        options = ClusterOptions(auth)
+        options.apply_profile("wan_development")
+        cluster = Cluster(self.connection_string, options)
+        cluster.wait_until_ready(timedelta(seconds=5))
+        return cluster
+
 
 class CouchbaseUploadStagerConfig(UploadStagerConfig):
     pass
@@ -98,26 +127,9 @@ class CouchbaseUploader(Uploader):
     upload_config: CouchbaseUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["couchbase"], extras="couchbase")
-    def connect_to_couchbase(self) -> "Cluster":
-        from couchbase.auth import PasswordAuthenticator
-        from couchbase.cluster import Cluster
-        from couchbase.options import ClusterOptions
-
-        connection_string = self.connection_config.connection_string
-        username = self.connection_config.username
-        password = self.connection_config.access_config.get_secret_value().password
-
-        auth = PasswordAuthenticator(username, password)
-        options = ClusterOptions(auth)
-        options.apply_profile("wan_development")
-        cluster = Cluster(connection_string, options)
-        cluster.wait_until_ready(timedelta(seconds=5))
-        return cluster
-
     def precheck(self) -> None:
         try:
-            self.connect_to_couchbase()
+            self.connection_config.connect_to_couchbase()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -133,7 +145,7 @@ class CouchbaseUploader(Uploader):
             f"bucket, {self.connection_config.bucket} "
             f"at {self.connection_config.connection_string}",
         )
-        cluster = self.connect_to_couchbase()
+        cluster = self.connection_config.connect_to_couchbase()
         bucket = cluster.bucket(self.connection_config.bucket)
         scope = bucket.scope(self.connection_config.scope)
         collection = scope.collection(self.connection_config.collection)
@@ -142,6 +154,168 @@ class CouchbaseUploader(Uploader):
             collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})
 
 
+class CouchbaseIndexerConfig(IndexerConfig):
+    batch_size: int = Field(default=50, description="Number of documents to index per batch")
+
+
+@dataclass
+class CouchbaseIndexer(Indexer):
+    connection_config: CouchbaseConnectionConfig
+    index_config: CouchbaseIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.connect_to_couchbase()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def _get_doc_ids(self) -> List[str]:
+        query = (
+            f"SELECT META(d).id "
+            f"FROM `{self.connection_config.bucket}`."
+            f"`{self.connection_config.scope}`."
+            f"`{self.connection_config.collection}` as d"
+        )
+
+        max_attempts = 5
+        attempts = 0
+        while attempts < max_attempts:
+            try:
+                cluster = self.connection_config.connect_to_couchbase()
+                result = cluster.query(query)
+                document_ids = [row["id"] for row in result]
+                return document_ids
+            except Exception as e:
+                attempts += 1
+                time.sleep(3)
+                if attempts == max_attempts:
+                    raise SourceConnectionError(f"failed to get document ids: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+
+        id_batches = [
+            ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(tuple(batch)) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=CONNECTOR_TYPE,
+                metadata=FileDataSourceMetadata(
+                    url=f"{self.connection_config.connection_string}/"
+                    f"{self.connection_config.bucket}",
+                    date_processed=str(time.time()),
+                ),
+                additional_metadata={
+                    "ids": list(batch),
+                    "bucket": self.connection_config.bucket,
+                },
+            )
+
+
+class CouchbaseDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+@dataclass
+class CouchbaseDownloader(Downloader):
+    connection_config: CouchbaseConnectionConfig
+    download_config: CouchbaseDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def is_async(self) -> bool:
+        return False
+
+    def get_identifier(self, bucket: str, record_id: str) -> str:
+        f = f"{bucket}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def map_cb_results(self, cb_results: dict) -> str:
+        doc_body = cb_results
+        flattened_dict = flatten_dict(dictionary=doc_body)
+        str_values = [str(value) for value in flattened_dict.values()]
+        concatenated_values = "\n".join(str_values)
+        return concatenated_values
+
+    def generate_download_response(
+        self, result: dict, bucket: str, file_data: FileData
+    ) -> DownloadResponse:
+        record_id = result["id"]
+        filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
+        filename = f"{filename_id}.txt"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from bucket {bucket} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            with open(download_path, "w", encoding="utf8") as f:
+                f.write(self.map_cb_results(cb_results=result))
+        except Exception as e:
+            logger.error(
+                f"failed to download from bucket {bucket} "
+                f"and id {record_id} to {download_path}: {e}",
+                exc_info=True,
+            )
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+        return DownloadResponse(
+            file_data=FileData(
+                identifier=filename_id,
+                connector_type=CONNECTOR_TYPE,
+                metadata=FileDataSourceMetadata(
+                    version=None,
+                    date_processed=str(time.time()),
+                    record_locator={
+                        "connection_string": self.connection_config.connection_string,
+                        "bucket": bucket,
+                        "scope": self.connection_config.scope,
+                        "collection": self.connection_config.collection,
+                        "document_id": record_id,
+                    },
+                ),
+            ),
+            path=download_path,
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        bucket_name: str = file_data.additional_metadata["bucket"]
+        ids: list[str] = file_data.additional_metadata["ids"]
+
+        cluster = self.connection_config.connect_to_couchbase()
+        bucket = cluster.bucket(bucket_name)
+        scope = bucket.scope(self.connection_config.scope)
+        collection = scope.collection(self.connection_config.collection)
+
+        download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+        return list(download_resp)
+
+    def process_doc_id(self, doc_id, collection, bucket_name, file_data):
+        result = collection.get(doc_id)
+        return self.generate_download_response(
+            result=result.content_as[dict], bucket=bucket_name, file_data=file_data
+        )
+
+    def process_all_doc_ids(self, ids, collection, bucket_name, file_data):
+        for doc_id in ids:
+            yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
+
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        raise NotImplementedError()
+
+
 couchbase_destination_entry = DestinationRegistryEntry(
     connection_config=CouchbaseConnectionConfig,
     uploader=CouchbaseUploader,
@@ -149,3 +323,11 @@ couchbase_destination_entry = DestinationRegistryEntry(
     upload_stager=CouchbaseUploadStager,
     upload_stager_config=CouchbaseUploadStagerConfig,
 )
+
+couchbase_source_entry = SourceRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    indexer=CouchbaseIndexer,
+    indexer_config=CouchbaseIndexerConfig,
+    downloader=CouchbaseDownloader,
+    downloader_config=CouchbaseDownloaderConfig,
+)

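The new indexer splits the collection's document IDs into fixed-size batches using ceiling division, then derives each batch's `FileData` identifier from a hash offset by `sys.maxsize + 1` so it can never be negative; the downloader later reads each batch's IDs back from `additional_metadata` and fetches the documents one by one. A self-contained sketch of just the batching arithmetic, with made-up IDs:

```python
import sys

ids = [f"doc-{n}" for n in range(7)]  # made-up document IDs
batch_size = 3

# Ceiling division: 7 IDs with a batch size of 3 yields 3 batches (3, 3, 1).
num_batches = (len(ids) + batch_size - 1) // batch_size
batches = [ids[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]

for batch in batches:
    # hash() is never below -sys.maxsize - 1, so the offset keeps the identifier non-negative.
    identifier = str(hash(tuple(batch)) + sys.maxsize + 1)
    print(identifier, batch)
```
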
unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -0,0 +1,170 @@
+import json
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploadContent,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from kdbai_client import Session, Table
+
+CONNECTOR_TYPE = "kdbai"
+
+
+class KdbaiAccessConfig(AccessConfig):
+    api_key: Optional[str] = Field(
+        default=None,
+        description="A string for the api-key, can be left empty "
+        "when connecting to local KDBAI instance.",
+    )
+
+
+SecretKdbaiAccessConfig = Secret[KdbaiAccessConfig]
+
+
+class KdbaiConnectionConfig(ConnectionConfig):
+    access_config: SecretKdbaiAccessConfig = Field(
+        default=SecretKdbaiAccessConfig(secret_value=KdbaiAccessConfig())
+    )
+    endpoint: str = Field(
+        default="http://localhost:8082", description="Endpoint url where KDBAI is hosted."
+    )
+
+    @requires_dependencies(["kdbai_client"], extras="kdbai")
+    def get_session(self) -> "Session":
+        from kdbai_client import Session
+
+        return Session(
+            api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+        )
+
+
+class KdbaiUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class KdbaiUploadStager(UploadStager):
+    upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        data = []
+        for element in elements_contents:
+            data.append(
+                {
+                    "id": str(uuid.uuid4()),
+                    "element_id": element.get("element_id"),
+                    "document": element.pop("text", None),
+                    "embeddings": element.get("embeddings"),
+                    "metadata": flatten_dict(
+                        dictionary=element.get("metadata"),
+                        flatten_lists=True,
+                        remove_none=True,
+                    ),
+                }
+            )
+        logger.debug(f"writing {len(data)} elements to {output_path}")
+        with output_path.open("w") as output_file:
+            json.dump(data, output_file, indent=2)
+        return output_path
+
+
+class KdbaiUploaderConfig(UploaderConfig):
+    table_name: str = Field(description="The name of the KDBAI table to write into.")
+    batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+@dataclass
+class KdbaiUploader(Uploader):
+    connection_config: KdbaiConnectionConfig
+    upload_config: KdbaiUploaderConfig
+    connector_type: str = field(default=CONNECTOR_TYPE, init=False)
+
+    def precheck(self) -> None:
+        try:
+            self.get_table()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def get_table(self) -> "Table":
+        session: Session = self.connection_config.get_session()
+        table = session.table(self.upload_config.table_name)
+        return table
+
+    def upsert_batch(self, batch: pd.DataFrame):
+        table = self.get_table()
+        table.insert(data=batch)
+
+    def process_dataframe(self, df: pd.DataFrame):
+        logger.debug(
+            f"uploading {len(df)} entries to {self.connection_config.endpoint} "
+            f"db in table {self.upload_config.table_name}"
+        )
+        for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
+            self.upsert_batch(batch=batch_df)
+
+    def process_csv(self, csv_paths: list[Path]):
+        logger.debug(f"uploading content from {len(csv_paths)} csv files")
+        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+        self.process_dataframe(df=df)
+
+    def process_json(self, json_paths: list[Path]):
+        logger.debug(f"uploading content from {len(json_paths)} json files")
+        all_records = []
+        for p in json_paths:
+            with open(p) as json_file:
+                all_records.extend(json.load(json_file))
+
+        df = pd.DataFrame(data=all_records)
+        self.process_dataframe(df=df)
+
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+        csv_paths = [c.path for c in contents if c.path.suffix == ".csv"]
+        if csv_paths:
+            self.process_csv(csv_paths=csv_paths)
+        json_paths = [c.path for c in contents if c.path.suffix == ".json"]
+        if json_paths:
+            self.process_json(json_paths=json_paths)
+
+
+kdbai_destination_entry = DestinationRegistryEntry(
+    connection_config=KdbaiConnectionConfig,
+    uploader=KdbaiUploader,
+    uploader_config=KdbaiUploaderConfig,
+    upload_stager=KdbaiUploadStager,
+    upload_stager_config=KdbaiUploadStagerConfig,
+)

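The new KDB.AI uploader stages each element as a JSON record (generated id, element_id, the element text as `document`, embeddings, and flattened metadata), loads the staged records into a pandas DataFrame, and inserts them in batches by grouping on the row index divided by `batch_size`. A minimal sketch of just that grouping step, with the table insert replaced by a print:

```python
import numpy as np
import pandas as pd

records = [{"id": str(i), "document": f"text {i}"} for i in range(10)]  # stand-in records
df = pd.DataFrame(data=records)
batch_size = 4

# np.arange(len(df)) // batch_size labels rows 0-3 as group 0, 4-7 as group 1, 8-9 as group 2.
for _, batch_df in df.groupby(np.arange(len(df)) // batch_size):
    print(len(batch_df))  # the connector passes each batch to the table insert instead
```
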
unstructured_ingest/v2/processes/connectors/local.py
@@ -71,9 +71,12 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
+        files = []
         if self.index_config.recursive:
-
-
+            files.extend(list(input_path.rglob("*")))
+        else:
+            files.extend(list(input_path.glob("*")))
+        return [f for f in files if f.is_file()]
 
     def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()

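The indexer's directory branch now gathers entries with `rglob("*")` when recursion is enabled (or `glob("*")` otherwise) and keeps only regular files. An equivalent standalone restatement, assuming a `Path` input and a `recursive` flag (the single-file case is simplified here; the connector runs the path through `glob.glob` first):

```python
from pathlib import Path


def list_input_files(input_path: Path, recursive: bool) -> list[Path]:
    """Restates the listing logic from the diff for a quick local check."""
    if input_path.is_file():
        return [input_path]
    files = list(input_path.rglob("*")) if recursive else list(input_path.glob("*"))
    # Drop directories matched by the glob; only regular files get indexed.
    return [f for f in files if f.is_file()]
```
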
unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -42,7 +42,6 @@ SecretPineconeAccessConfig = Secret[PineconeAccessConfig]
 
 class PineconeConnectionConfig(ConnectionConfig):
     index_name: str = Field(description="Name of the index to connect to.")
-    environment: str = Field(description="Environment to connect to.")
     access_config: SecretPineconeAccessConfig = Field(
         default_factory=lambda: SecretPineconeAccessConfig(secret_value=PineconeAccessConfig())
     )
@@ -155,7 +154,6 @@ class PineconeUploader(Uploader):
         logger.info(
             f"writing document batches to destination"
             f" index named {self.connection_config.index_name}"
-            f" environment named {self.connection_config.environment}"
             f" with batch size {self.upload_config.batch_size}"
             f" with {self.upload_config.num_processes} (number of) processes"
         )