unstructured-ingest 0.0.19__py3-none-any.whl → 0.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/astradb.py +2 -2
- unstructured_ingest/connector/astradb.py +54 -24
- unstructured_ingest/embed/bedrock.py +56 -19
- unstructured_ingest/embed/huggingface.py +22 -22
- unstructured_ingest/embed/interfaces.py +11 -4
- unstructured_ingest/embed/mixedbreadai.py +17 -17
- unstructured_ingest/embed/octoai.py +7 -7
- unstructured_ingest/embed/openai.py +15 -20
- unstructured_ingest/embed/vertexai.py +25 -17
- unstructured_ingest/embed/voyageai.py +22 -17
- unstructured_ingest/v2/cli/base/cmd.py +1 -1
- unstructured_ingest/v2/interfaces/connector.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +3 -1
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +6 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +4 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
- unstructured_ingest/v2/processes/chunker.py +8 -29
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +11 -8
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
- unstructured_ingest/v2/processes/partitioner.py +9 -55
- unstructured_ingest/v2/unstructured_api.py +87 -0
- unstructured_ingest/v2/utils.py +1 -1
- unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +46 -45
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
- unstructured_ingest-0.0.19.dist-info/METADATA +0 -639
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0
unstructured_ingest/embed/vertexai.py

@@ -3,7 +3,7 @@ import json
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, Any,
+from typing import TYPE_CHECKING, Annotated, Any, Optional
 
 import numpy as np
 from pydantic import Field, Secret, ValidationError
@@ -13,7 +13,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
-    from
+    from vertexai.language_models import TextEmbeddingModel
 
 
 def conform_string_to_dict(value: Any) -> dict:
@@ -41,45 +41,53 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
         os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(application_credentials_path)
 
     @requires_dependencies(
-        ["
+        ["vertexai"],
         extras="embed-vertexai",
     )
-    def get_client(self) -> "
+    def get_client(self) -> "TextEmbeddingModel":
         """Creates a Langchain VertexAI python client to embed elements."""
-        from
+        from vertexai.language_models import TextEmbeddingModel
 
         self.register_application_credentials()
-
-        return vertexai_client
+        return TextEmbeddingModel.from_pretrained(self.embedder_model_name)
 
 
 @dataclass
 class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VertexAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) ->
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="A sample query.")
 
-    def num_of_dimensions(self):
+    def num_of_dimensions(self) -> tuple[int, ...]:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.shape(exemplary_embedding)
 
-    def is_unit_vector(self):
+    def is_unit_vector(self) -> bool:
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
 
     def embed_query(self, query):
-
-        result = client.embed_query(str(query))
-        return result
+        return self._embed_documents(elements=[query])[0]
 
-    def embed_documents(self, elements:
-
-        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
 
-
+    @requires_dependencies(
+        ["vertexai"],
+        extras="embed-vertexai",
+    )
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        from vertexai.language_models import TextEmbeddingInput
+
+        client = self.config.get_client()
+        inputs = [TextEmbeddingInput(text=element) for element in elements]
+        embeddings = client.get_embeddings(inputs)
+        return [e.values for e in embeddings]
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/embed/voyageai.py

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Optional
 
 import numpy as np
 from pydantic import Field, SecretStr
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
-    from
+    from voyageai import Client as VoyageAIClient
 
 
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
@@ -16,28 +16,30 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
     embedder_model_name: str = Field(alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
+    max_retries: int = 0
+    timeout_in_seconds: Optional[int] = None
 
     @requires_dependencies(
         ["langchain", "langchain_voyageai"],
         extras="embed-voyageai",
     )
-    def get_client(self) -> "
+    def get_client(self) -> "VoyageAIClient":
         """Creates a Langchain VoyageAI python client to embed elements."""
-        from
+        from voyageai import Client as VoyageAIClient
 
-
-
-
-
-            truncation=self.truncation,
+        client = VoyageAIClient(
+            api_key=self.api_key.get_secret_value(),
+            max_retries=self.max_retries,
+            timeout=self.timeout_in_seconds,
         )
+        return client
 
 
 @dataclass
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) ->
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="A sample query.")
 
     @property
@@ -50,17 +52,20 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
 
-    def
-        client = self.config.get_client()
-
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        client: VoyageAIClient = self.config.get_client()
+        response = client.embed(texts=elements, model=self.config.embedder_model_name)
+        return response.embeddings
+
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         return self._add_embeddings_to_elements(elements, embeddings)
 
-    def embed_query(self, query: str) ->
-
-        return client.embed_query(query)
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(elements=[query])[0]
 
     @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) ->
+    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
         assert len(elements) == len(embeddings)
         elements_w_embedding = []
         for i, element in enumerate(elements):

unstructured_ingest/v2/cli/base/cmd.py

@@ -155,7 +155,7 @@ class BaseCmd(ABC):
     @staticmethod
     def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
         filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
-        if not filterer_configs.
+        if not filterer_configs.model_dump():
             return None
         return Filterer(config=filterer_configs)
 

unstructured_ingest/v2/interfaces/connector.py

@@ -19,7 +19,7 @@ class ConnectionConfig(BaseModel):
     def get_access_config(self) -> dict[str, Any]:
         if not self.access_config:
             return {}
-        return self.access_config.get_secret_value().
+        return self.access_config.get_secret_value().model_dump()
 
 
 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)

unstructured_ingest/v2/pipeline/pipeline.py

@@ -187,7 +187,9 @@ class Pipeline:
         return filtered_records
 
     def _run(self):
-        logger.info(
+        logger.info(
+            f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
+        )
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()

unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -28,7 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/download.py

@@ -31,9 +31,13 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config =
+        config = (
+            self.process.download_config.model_dump_json() if self.process.download_config else None
+        )
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -28,7 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/filter.py

@@ -16,7 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:

unstructured_ingest/v2/pipeline/steps/index.py

@@ -23,9 +23,11 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.index_config.
+        config = self.process.index_config.model_dump_json() if self.process.index_config else None
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json()
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/stage.py

@@ -28,7 +28,9 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            self.process.upload_stager_config.
+            self.process.upload_stager_config.model_dump_json()
+            if self.process.upload_stager_config
+            else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"created {self.identifier} with configs: {config}")

unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -22,7 +22,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(

unstructured_ingest/v2/pipeline/steps/upload.py

@@ -25,9 +25,13 @@ class UploadStep(BatchPipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config =
+        config = (
+            self.process.upload_config.model_dump_json() if self.process.upload_config else None
+        )
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/processes/chunker.py

@@ -1,5 +1,5 @@
 from abc import ABC
-from dataclasses import dataclass
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional
 
@@ -9,6 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.unstructured_api import call_api
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -111,35 +112,13 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-
-        from unstructured_client.models.operations import PartitionRequest
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key.get_secret_value(),
+        elements = await call_api(
             server_url=self.config.chunking_endpoint,
+            api_key=self.config.chunk_api_key.get_secret_value(),
+            filename=elements_filepath,
+            api_parameters=self.config.to_chunking_kwargs(),
         )
-
-        possible_fields = [f.name for f in fields(PartitionParameters)]
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        with open(elements_filepath, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(elements_filepath.resolve()),
-            )
-        filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        partition_request_obj = PartitionRequest(partition_params)
-        resp = client.general.partition(partition_request_obj)
-        elements = resp.elements or []
+
         elements = assign_and_map_hash_ids(elements=elements)
+
         return elements

unstructured_ingest/v2/processes/connectors/airtable.py

@@ -181,7 +181,7 @@ class AirtableIndexer(Indexer):
             yield FileData(
                 identifier=table_meta.get_id(),
                 connector_type=CONNECTOR_TYPE,
-                additional_metadata=table_meta.
+                additional_metadata=table_meta.model_dump(),
                 source_identifiers=SourceIdentifiers(
                     filename=str(Path(fullpath).name),
                     fullpath=fullpath,

unstructured_ingest/v2/processes/connectors/astradb.py

@@ -25,7 +25,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 
 if TYPE_CHECKING:
-    from astrapy
+    from astrapy import Collection as AstraDBCollection
+
 
 CONNECTOR_TYPE = "astradb"
 
@@ -85,7 +86,12 @@ class AstraDBUploaderConfig(UploaderConfig):
     embedding_dimension: int = Field(
         default=384, description="The dimensionality of the embeddings"
    )
-
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The Astra DB connection namespace.",
+        deprecated="Please use 'keyspace' instead.",
+    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
@@ -109,33 +115,34 @@ class AstraDBUploader(Uploader):
 
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
-        from astrapy
+        from astrapy import DataAPIClient as AstraDBClient
 
-        #
-
-        embedding_dimension = self.upload_config.embedding_dimension
-        requested_indexing_policy = self.upload_config.requested_indexing_policy
+        # Choose keyspace or deprecated namespace
+        keyspace_param = self.upload_config.keyspace or self.upload_config.namespace
 
-        #
-
+        # Get the collection_name
+        collection_name = self.upload_config.collection_name
 
         # Build the Astra DB object.
-        # caller_name/version for AstraDB tracking
         access_configs = self.connection_config.access_config.get_secret_value()
-
-
-
-
+
+        # Create a client object to interact with the Astra DB
+        # caller_name/version for Astra DB tracking
+        my_client = AstraDBClient(
             caller_name=integration_name,
             caller_version=integration_version,
         )
 
-        #
-
-
-
-
+        # Get the database object
+        astra_db = my_client.get_database(
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
+            keyspace=keyspace_param,
         )
+
+        # Connect to the newly created collection
+        astra_db_collection = astra_db.get_collection(name=collection_name)
+
         return astra_db_collection
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -42,8 +42,10 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         description="The Databricks password part of basic authentication. "
         "Only possible when Host is *.cloud.databricks.com (AWS).",
     )
-    client_id: Optional[str] = Field(default=None)
-    client_secret: Optional[str] = Field(
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
     token: Optional[str] = Field(
         default=None,
         description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
@@ -128,7 +130,7 @@ class DatabricksVolumesUploader(Uploader):
 
         return WorkspaceClient(
             host=self.connection_config.host,
-            **self.connection_config.access_config.get_secret_value().
+            **self.connection_config.access_config.get_secret_value().model_dump(),
         )
 
     def precheck(self) -> None:
@@ -140,11 +142,12 @@ class DatabricksVolumesUploader(Uploader):
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         output_path = os.path.join(self.upload_config.path, path.name)
-
-
-
-
-
+        with open(path, "rb") as elements_file:
+            self.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
 
 
 databricks_volumes_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -104,8 +104,8 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"elasticsearch client inputs mapped to: {client_input.
-        client_kwargs = client_input.
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.model_dump()}")
+        client_kwargs = client_input.model_dump()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
         )

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -84,7 +85,7 @@ class AzureConnectionConfig(FsspecConnectionConfig):
     def get_access_config(self) -> dict[str, Any]:
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs: dict[str, Any] = {
-            k: v for k, v in self.access_config.get_secret_value().
+            k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v
         }
         return access_configs
 
@@ -99,14 +100,39 @@ class AzureIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
-    def sterilize_info(self,
-
-        return sterilize_dict(data=info, default=azure_json_serial)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data, default=azure_json_serial)
 
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = (
+            file_data.get("creation_time").timestamp() if "creation_time" in file_data else None
+        )
+        date_modified = (
+            file_data.get("last_modified").timestamp() if "last_modified" in file_data else None
+        )
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class AzureDownloaderConfig(FsspecDownloaderConfig):
     pass

unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -2,12 +2,14 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
+from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -52,7 +54,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
                 ac.box_app_config,
             ),
         }
-        access_config: dict[str, Any] = ac.
+        access_config: dict[str, Any] = ac.model_dump()
         access_config.pop("box_app_config", None)
         access_kwargs_with_oauth.update(access_config)
 
@@ -73,6 +75,33 @@ class BoxIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("modified_at"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("created_at"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("id")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class BoxDownloaderConfig(FsspecDownloaderConfig):
     pass