unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/interfaces.py +4 -4
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +9 -2
- unstructured_ingest/v2/pipeline/pipeline.py +17 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/METADATA +272 -274
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/RECORD +69 -69
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/airtable.py
ADDED
@@ -0,0 +1,235 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+import pandas
+from pydantic import BaseModel, Field, Secret, field_validator
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from pyairtable import Api
+    from pyairtable.api.types import RecordDict
+
+CONNECTOR_TYPE = "airtable"
+
+
+class AirtableTableMeta(BaseModel):
+    """Metadata specifying a table id, a base id which the table is stored in,
+    and an t.Optional view id in case particular rows and fields are to be ingested"""
+
+    base_id: str
+    table_id: str
+    view_id: Optional[str] = None
+
+    def get_id(self) -> str:
+        id_s = f"{self.base_id}{self.table_id}"
+        id_s = f"{id_s}{self.view_id}" if self.view_id else id_s
+        return str(uuid5(NAMESPACE_DNS, id_s))
+
+
+class AirtableAccessConfig(AccessConfig):
+    personal_access_token: str = Field(
+        description="Personal access token to authenticate into Airtable. Check: "
+        "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
+        "for more info"
+    )
+
+
+class AirtableConnectionConfig(ConnectionConfig):
+    access_config: Secret[AirtableAccessConfig]
+
+    @requires_dependencies(["pyairtable"], extras="airtable")
+    def get_client(self) -> "Api":
+        from pyairtable import Api
+
+        access_config = self.access_config.get_secret_value()
+        return Api(api_key=access_config.personal_access_token)
+
+
+class AirtableIndexerConfig(IndexerConfig):
+    list_of_paths: Optional[list[str]] = Field(
+        default=None,
+        description="""
+        A list of paths that specify the locations to ingest data from within Airtable.
+
+        If this argument is not set, the connector ingests all tables within each and every base.
+        --list-of-paths: path1 path2 path3 ….
+        path: base_id/table_id(optional)/view_id(optional)/
+
+        To obtain (base, table, view) ids in bulk, check:
+        https://airtable.com/developers/web/api/list-bases (base ids)
+        https://airtable.com/developers/web/api/get-base-schema (table and view ids)
+        https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)
+
+        To obtain specific ids from Airtable UI, go to your workspace, and copy any
+        relevant id from the URL structure:
+        https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
+        appAbcDeF1ghijKlm -> base_id
+        tblABcdEfG1HIJkLm -> table_id
+        viwABCDEfg6hijKLM -> view_id
+
+        You can also check: https://support.airtable.com/docs/finding-airtable-ids
+
+        Here is an example for one --list-of-paths:
+            base1/ → gets the entirety of all tables inside base1
+            base1/table1 → gets all rows and columns within table1 in base1
+            base1/table1/view1 → gets the rows and columns that are
+                visible in view1 for the table1 in base1
+
+        Examples to invalid airtable_paths:
+            table1 → has to mention base to be valid
+            base1/view1 → has to mention table to be valid
+        """,
+    )
+
+    @classmethod
+    def validate_path(cls, path: str):
+        components = path.split("/")
+        if len(components) > 3:
+            raise ValueError(
+                f"Path must be of the format: base_id/table_id/view_id, "
+                f"where table id and view id are optional. Got: {path}"
+            )
+
+    @field_validator("list_of_paths")
+    @classmethod
+    def validate_format(cls, v: list[str]) -> list[str]:
+        for path in v:
+            cls.validate_path(path=path)
+        return v
+
+
+@dataclass
+class AirtableIndexer(Indexer):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: AirtableConnectionConfig
+    index_config: AirtableIndexerConfig
+
+    def get_all_table_meta(self) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        bases = client.bases()
+        airtable_meta = []
+        for base in bases:
+            for table in base.schema().tables:
+                airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        base = client.base(base_id=base_id)
+        airtable_meta = []
+        for table in base.tables():
+            airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_meta_from_list(self) -> list[AirtableTableMeta]:
+        airtable_meta = []
+        for path in self.index_config.list_of_paths:
+            components = path.split("/")
+            if len(components) == 1:
+                airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
+            elif len(components) == 2:
+                airtable_meta.append(
+                    AirtableTableMeta(base_id=components[0], table_id=components[1])
+                )
+            elif len(components) == 3:
+                airtable_meta.append(
+                    AirtableTableMeta(
+                        base_id=components[0], table_id=components[1], view_id=components[2]
+                    )
+                )
+            else:
+                raise ValueError(
+                    f"Path must be of the format: base_id/table_id/view_id, "
+                    f"where table id and view id are optional. Got: {path}"
+                )
+        return airtable_meta
+
+    def get_table_metas(self) -> list[AirtableTableMeta]:
+        if not self.index_config.list_of_paths:
+            return self.get_all_table_meta()
+        return self.get_meta_from_list()
+
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.request(method="HEAD", url=client.build_url("meta", "bases"))
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        table_metas = self.get_table_metas()
+        for table_meta in table_metas:
+            fullpath = (
+                f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
+                if table_meta.view_id
+                else f"{table_meta.base_id}/{table_meta.table_id}.csv"
+            )
+            yield FileData(
+                identifier=table_meta.get_id(),
+                connector_type=CONNECTOR_TYPE,
+                additional_metadata=table_meta.dict(),
+                source_identifiers=SourceIdentifiers(
+                    filename=str(Path(fullpath).name),
+                    fullpath=fullpath,
+                ),
+            )
+
+
+class AirtableDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class AirtableDownloader(Downloader):
+    connection_config: AirtableConnectionConfig
+    download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
+        client = self.connection_config.get_client()
+        table = client.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
+        table_fetch_kwargs = {"view": table_meta.view_id} if table_meta.view_id else {}
+        rows = table.all(**table_fetch_kwargs)
+        return rows
+
+    def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
+        row_dict = {
+            "id": table_row["id"],
+            "created_time": table_row["createdTime"],
+        }
+        row_dict.update(table_row["fields"])
+        return row_dict
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
+        table_contents = self.get_table_contents(table_meta=table_meta)
+        df = pandas.DataFrame.from_dict(
+            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
+        ).sort_index(axis=1)
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(path_or_buf=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+airtable_source_entry = SourceRegistryEntry(
+    indexer=AirtableIndexer,
+    indexer_config=AirtableIndexerConfig,
+    downloader=AirtableDownloader,
+    downloader_config=AirtableDownloaderConfig,
+    connection_config=AirtableConnectionConfig,
+)
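Note: the indexer above derives a deterministic FileData identifier from the base, table, and view ids via uuid5. A minimal standalone sketch of that scheme (mirroring AirtableTableMeta.get_id(); the helper name and example ids are illustrative only):

from typing import Optional
from uuid import NAMESPACE_DNS, uuid5


def table_meta_id(base_id: str, table_id: str, view_id: Optional[str] = None) -> str:
    # Mirrors AirtableTableMeta.get_id(): concatenate the ids and hash with uuid5
    id_s = f"{base_id}{table_id}"
    if view_id:
        id_s = f"{id_s}{view_id}"
    return str(uuid5(NAMESPACE_DNS, id_s))


# Example ids taken from the docstring above (illustrative values)
base_id, table_id, view_id = "appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM".split("/")
print(table_meta_id(base_id, table_id, view_id))  # same output on every run

Because uuid5 is deterministic, re-indexing the same base/table/view always produces the same identifier.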

unstructured_ingest/v2/processes/connectors/elasticsearch.py
CHANGED
@@ -104,7 +104,7 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None

unstructured_ingest/v2/processes/connectors/fsspec/box.py
CHANGED
@@ -47,7 +47,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     def get_access_config(self) -> dict[str, Any]:
-        # Return access_kwargs with oauth. The oauth object
+        # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
         # because it is not serializable.
         from boxsdk import JWTAuth


unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
CHANGED
@@ -317,9 +317,9 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -328,7 +328,7 @@ class FsspecUploader(Uploader):
         # Odd that fsspec doesn't run exists() as async even when client support async
         already_exists = self.fs.exists(path=str(upload_path))
         if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

unstructured_ingest/v2/processes/connectors/google_drive.py
CHANGED
@@ -199,7 +199,7 @@ class GoogleDriveIndexer(Indexer):
         if extensions:
             ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
             q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
-        logger.debug(f"
+        logger.debug(f"query used when indexing: {q}")
         logger.debug("response fields limited to: {}".format(", ".join(self.fields)))
         done = False
         page_token = None

unstructured_ingest/v2/processes/connectors/local.py
CHANGED
@@ -180,14 +180,15 @@ class LocalUploader(Uploader):

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         if source_identifiers := file_data.source_identifiers:
-            identifiers = source_identifiers
             rel_path = (
-
-                if
-                else
+                source_identifiers.relative_path[1:]
+                if source_identifiers.relative_path.startswith("/")
+                else source_identifiers.relative_path
             )
             new_path = self.upload_config.output_path / Path(rel_path)
-            final_path = str(new_path).replace(
+            final_path = str(new_path).replace(
+                source_identifiers.filename, f"{source_identifiers.filename}.json"
+            )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
         Path(final_path).parent.mkdir(parents=True, exist_ok=True)
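To make the new path handling concrete, a small standalone sketch (the values below stand in for upload_config.output_path and source_identifiers; only the leading-slash strip and the .json suffix mirror the hunk above):

from pathlib import Path

output_path = Path("/tmp/ingest-output")   # stands in for upload_config.output_path
relative_path = "/docs/report.pdf"         # stands in for source_identifiers.relative_path
filename = "report.pdf"                    # stands in for source_identifiers.filename

rel_path = relative_path[1:] if relative_path.startswith("/") else relative_path
new_path = output_path / Path(rel_path)
final_path = str(new_path).replace(filename, f"{filename}.json")
print(final_path)  # /tmp/ingest-output/docs/report.pdf.json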

unstructured_ingest/v2/processes/connectors/milvus.py
CHANGED
@@ -71,7 +71,7 @@ class MilvusUploadStagerConfig(UploadStagerConfig):
     fields_to_include: Optional[list[str]] = None
     """If set - list of fields to include in the output.
     Unspecified fields are removed from the elements.
-    This action
+    This action takes place after metadata flattening.
    Missing fields will cause stager to throw KeyError."""

     flatten_metadata: bool = True
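A rough sketch of the stager behaviour the docstring describes (the helper names below are illustrative, not the package's actual functions): metadata is flattened first, then only fields_to_include are kept, and a missing field raises KeyError.

def flatten_metadata(element: dict) -> dict:
    # Lift metadata keys to the top level and drop the nested "metadata" dict
    flat = {k: v for k, v in element.items() if k != "metadata"}
    flat.update(element.get("metadata", {}))
    return flat

def keep_fields(element: dict, fields_to_include: list) -> dict:
    # A field that is absent raises KeyError, as the docstring warns
    return {f: element[f] for f in fields_to_include}

element = {"text": "hello", "type": "NarrativeText", "metadata": {"filename": "a.pdf"}}
print(keep_fields(flatten_metadata(element), ["text", "filename"]))
# {'text': 'hello', 'filename': 'a.pdf'}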

unstructured_ingest/v2/processes/connectors/onedrive.py
CHANGED
@@ -213,9 +213,9 @@ class OnedriveDownloader(Downloader):
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")
         if fsize > MAX_MB_SIZE:
-            logger.info(f"
+            logger.info(f"downloading file with size: {fsize} bytes in chunks")
             with download_path.open(mode="wb") as f:
                 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
         else:

unstructured_ingest/v2/processes/connectors/opensearch.py
CHANGED
@@ -101,7 +101,7 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()

unstructured_ingest/v2/processes/connectors/pinecone.py
CHANGED
@@ -57,7 +57,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         )

         index = pc.Index(name=self.index_name, **index_kwargs)
-        logger.debug(f"
+        logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index


@@ -166,7 +166,7 @@ class PineconeUploader(Uploader):
                 max_batch_size=self.upload_config.batch_size,
             )
         )
-        logger.info(f"
+        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")

         max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
         if self.upload_config.pool_threads:
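The log line above reports how many batches the element list was split into before upserting. A generic illustration of that kind of bounded batching (the helper name is hypothetical, not the package's code):

def split_into_batches(items: list, max_batch_size: int) -> list:
    # Chunks of at most max_batch_size items, order preserved
    return [items[i : i + max_batch_size] for i in range(0, len(items), max_batch_size)]

chunks = split_into_batches(list(range(250)), max_batch_size=100)
print(len(chunks), [len(c) for c in chunks])  # 3 [100, 100, 50]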

unstructured_ingest/v2/processes/connectors/sharepoint.py
CHANGED
@@ -60,13 +60,16 @@ class SharepointAccessConfig(AccessConfig):


 class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: str = Field(
-
+    permissions_application_id: Optional[str] = Field(
+        default=None, description="Microsoft Graph API application id"
+    )
+    permissions_tenant: Optional[str] = Field(
+        default=None,
         description="url to get permissions data within tenant.",
         examples=["https://contoso.onmicrosoft.com"],
     )
-    permissions_client_cred: SecretStr = Field(
-        description="Microsoft Graph API application credentials"
+    permissions_client_cred: Optional[SecretStr] = Field(
+        default=None, description="Microsoft Graph API application credentials"
     )
     authority_url: Optional[SecretStr] = Field(
         repr=False,
@@ -335,7 +338,8 @@ class SharepointIndexer(Indexer):
     @property
     def process_permissions(self) -> bool:
         return (
-            self.connection_config.permissions_config
+            self.connection_config.permissions_config is not None
+            and self.connection_config.permissions_config.permissions_tenant
             and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )

unstructured_ingest/v2/processes/filter.py
CHANGED
@@ -47,7 +47,7 @@ class Filterer(BaseProcess, ABC):
         for pattern in patterns:
             if fnmatch.filter([path], pattern):
                 return True
-        logger.debug(f"
+        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
         return False

     def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
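For context on the check above: fnmatch.filter returns a non-empty list when the path matches a pattern, and fnmatch's "*" also crosses "/" boundaries. A quick illustration with made-up paths and patterns:

import fnmatch

path = "docs/report.pdf"
patterns = ["*.pdf", "images/*"]
# True: "*.pdf" matches because fnmatch's "*" spans path separators
print(any(fnmatch.filter([path], pattern) for pattern in patterns))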

unstructured_ingest/v2/processes/partitioner.py
CHANGED
@@ -145,7 +145,7 @@ class Partitioner(BaseProcess, ABC):
 class FileDataSourceMetadata(DataSourceMetadata):
     filesize_bytes: Optional[int] = None

-        logger.debug(f"
+        logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
         logger.debug(f"partitioning file {filename} with metadata {metadata}")
         elements = partition(
             filename=str(filename.resolve()),
@@ -165,7 +165,7 @@ class Partitioner(BaseProcess, ABC):

         partition_request = self.config.to_partition_kwargs()

-        #
+        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
         # Prior to this it was a dataclass which doesn't have .__fields
         try:
             possible_fields = PartitionParameters.__fields__
@@ -182,7 +182,7 @@ class Partitioner(BaseProcess, ABC):
                 ", ".join([v for v in partition_request if v not in filtered_partition_request])
             )
         )
-        logger.debug(f"
+        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
         with open(filename, "rb") as f:
             files = Files(
                 content=f.read(),
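The NOTE above refers to PartitionParameters becoming a Pydantic model in unstructured-client 0.26.0; the surrounding code filters the request kwargs down to fields the class actually accepts. A generic sketch of that pattern (the dataclass fallback is an assumption, since the diff only shows the try branch; DemoParams is a stand-in):

import dataclasses

def accepted_field_names(params_cls) -> set:
    try:
        # Pydantic model (unstructured-client >= 0.26.0) exposes __fields__
        return set(params_cls.__fields__)
    except AttributeError:
        # Older releases used a dataclass, which has no __fields__
        return {f.name for f in dataclasses.fields(params_cls)}

def filter_kwargs(kwargs: dict, params_cls) -> dict:
    allowed = accepted_field_names(params_cls)
    return {k: v for k, v in kwargs.items() if k in allowed}

@dataclasses.dataclass
class DemoParams:  # stand-in for PartitionParameters
    strategy: str = "auto"
    languages: list = None

print(filter_kwargs({"strategy": "hi_res", "unknown_flag": True}, DemoParams))
# {'strategy': 'hi_res'}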
unstructured_ingest/v2/utils.py
CHANGED
@@ -20,6 +20,11 @@ def is_secret(value: Any) -> bool:
 def serialize_base_model(model: BaseModel) -> dict:
     # To get the full serialized dict regardless of if values are marked as Secret
     model_dict = model.dict()
+    return serialize_base_dict(model_dict=model_dict)
+
+
+def serialize_base_dict(model_dict: dict) -> dict:
+    model_dict = model_dict.copy()
     for k, v in model_dict.items():
         if isinstance(v, _SecretBase):
             secret_value = v.get_secret_value()
@@ -27,6 +32,8 @@ def serialize_base_model(model: BaseModel) -> dict:
                 model_dict[k] = serialize_base_model(model=secret_value)
             else:
                 model_dict[k] = secret_value
+        if isinstance(v, dict):
+            model_dict[k] = serialize_base_dict(model_dict=v)

     return model_dict

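A quick sketch of what the new serialize_base_dict recursion enables (assuming unstructured-ingest 0.0.16 and a pydantic v2 environment; SecretStr is used here purely for illustration): secret values nested inside plain dict entries are now unwrapped as well, not just top-level ones.

from pydantic import SecretStr

from unstructured_ingest.v2.utils import serialize_base_dict

raw = {"api_key": SecretStr("s3cr3t"), "client": {"token": SecretStr("t0ken")}}
# The nested dict is handled by the new recursive branch
print(serialize_base_dict(model_dict=raw))
# Expected: {'api_key': 's3cr3t', 'client': {'token': 't0ken'}}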