unstructured-ingest 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/conftest.py +13 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
- test/integration/connectors/sql/test_postgres.py +6 -10
- test/integration/connectors/sql/test_snowflake.py +205 -0
- test/integration/connectors/sql/test_sqlite.py +6 -10
- test/integration/connectors/test_delta_table.py +138 -0
- test/integration/connectors/utils/docker.py +78 -0
- test/integration/connectors/utils/validation.py +93 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +32 -1
- unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
- unstructured_ingest/v2/interfaces/indexer.py +4 -1
- unstructured_ingest/v2/pipeline/pipeline.py +10 -2
- unstructured_ingest/v2/pipeline/steps/index.py +18 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +10 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +1 -1
- unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +10 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +33 -37
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +164 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +38 -10
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +31 -32
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA +14 -12
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD +29 -24
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/top_level.txt +0 -0

test/integration/connectors/utils/validation.py (+93 -2)

@@ -1,3 +1,4 @@
+import filecmp
 import json
 import os
 import shutil
@@ -5,15 +6,31 @@ from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import Callable, Optional
 
+import pandas as pd
 from deepdiff import DeepDiff
 
 from test.integration.connectors.utils.constants import expected_results_path
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
+def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    expected_df = pd.read_csv(expected_filepath)
+    current_df = pd.read_csv(current_filepath)
+    if expected_df.equals(current_df):
+        return True
+    # Print diff
+    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
+        lambda x: x["_merge"] != "both"
+    ]
+    print("diff between expected and current df:")
+    print(diff)
+    return False
+
+
 @dataclass
 class ValidationConfigs:
     test_id: str
+    expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
@@ -21,6 +38,8 @@ class ValidationConfigs:
         default_factory=lambda: ["local_download_path", "metadata.date_processed"]
     )
     exclude_fields_extend: list[str] = field(default_factory=list)
+    validate_downloaded_files: bool = False
+    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -78,6 +97,13 @@ def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
     assert not diff, "diff in files that exist: {}".format(", ".join(diff))
 
 
+def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
+    expected_files = get_files(dir_path=expected_output_dir)
+    current_files = get_files(dir_path=current_output_dir)
+    diff = set(expected_files) ^ set(current_files)
+    assert not diff, "diff in files that exist: {}".format(", ".join(diff))
+
+
 def check_contents(
     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
 ):
@@ -96,6 +122,32 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+def check_raw_file_contents(
+    expected_output_dir: Path,
+    current_output_dir: Path,
+    configs: ValidationConfigs,
+):
+    current_files = get_files(dir_path=current_output_dir)
+    found_diff = False
+    files = []
+    for current_file in current_files:
+        current_file_path = current_output_dir / current_file
+        expected_file_path = expected_output_dir / current_file
+        if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+            is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
+        elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
+            is_different = not pandas_df_equality_check(
+                expected_filepath=expected_file_path, current_filepath=current_file_path
+            )
+        else:
+            is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
+        if is_different:
+            found_diff = True
+            files.append(str(expected_file_path))
+            print(f"diffs between files {expected_file_path} and {current_file_path}")
+    assert not found_diff, "Diffs found between files: {}".format(", ".join(files))
+
+
 def run_expected_results_validation(
     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
 ):
@@ -105,6 +157,21 @@ def run_expected_results_validation(
     )
 
 
+def run_expected_download_files_validation(
+    expected_output_dir: Path,
+    current_download_dir: Path,
+    configs: ValidationConfigs,
+):
+    check_files_in_paths(
+        expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
+    )
+    check_raw_file_contents(
+        expected_output_dir=expected_output_dir,
+        current_output_dir=current_download_dir,
+        configs=configs,
+    )
+
+
 def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
     directory_record = expected_output_dir / "directory_structure.json"
     with directory_record.open("r") as directory_file:
@@ -113,13 +180,18 @@ def run_directory_structure_validation(expected_output_dir: Path, download_files
     assert directory_structure == download_files
 
 
-def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[FileData]):
+def update_fixtures(
+    output_dir: Path,
+    download_dir: Path,
+    all_file_data: list[FileData],
+    save_downloads: bool = False,
+):
     # Delete current files
     shutil.rmtree(path=output_dir, ignore_errors=True)
     output_dir.mkdir(parents=True)
     # Rewrite the current file data
     file_data_output_path = output_dir / "file_data"
-    file_data_output_path.mkdir(parents=True)
+    file_data_output_path.mkdir(parents=True, exist_ok=True)
     for file_data in all_file_data:
         file_data_path = file_data_output_path / f"{file_data.identifier}.json"
         with file_data_path.open(mode="w") as f:
@@ -132,6 +204,11 @@ def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[Fi
     with download_dir_record.open(mode="w") as f:
         json.dump({"directory_structure": download_files}, f, indent=2)
 
+    # If applicable, save raw downloads
+    if save_downloads:
+        raw_download_output_path = output_dir / "downloads"
+        shutil.copytree(download_dir, raw_download_output_path)
+
 
 def run_all_validations(
     configs: ValidationConfigs,
@@ -140,6 +217,13 @@ def run_all_validations(
     download_dir: Path,
     test_output_dir: Path,
 ):
+    if expected_number_indexed_file_data := configs.expected_number_indexed_file_data:
+        assert (
+            len(predownload_file_data) == expected_number_indexed_file_data
+        ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
+    if expected_num_files := configs.expected_num_files:
+        assert len(postdownload_file_data) == expected_num_files
+
     for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
         configs.run_file_data_validation(
             predownload_file_data=pre_data, postdownload_file_data=post_data
@@ -155,6 +239,12 @@ def run_all_validations(
     run_directory_structure_validation(
         expected_output_dir=configs.test_output_dir(), download_files=download_files
     )
+    if configs.validate_downloaded_files:
+        run_expected_download_files_validation(
+            expected_output_dir=test_output_dir / "downloads",
+            current_download_dir=download_dir,
+            configs=configs,
+        )
 
 
 async def source_connector_validation(
@@ -200,4 +290,5 @@ async def source_connector_validation(
         output_dir=test_output_dir,
         download_dir=download_dir,
         all_file_data=all_postdownload_file_data,
+        save_downloads=configs.validate_downloaded_files,
     )
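The new validate_downloaded_files flag and downloaded_file_equality_check hook let a connector test compare the raw downloaded files against saved fixtures in addition to the structured output. A minimal sketch of how a test might opt in, with hypothetical values for the connector under test:

    configs = ValidationConfigs(
        test_id="example-connector",          # hypothetical test id
        expected_number_indexed_file_data=4,  # hypothetical counts
        expected_num_files=4,
        validate_downloaded_files=True,
        # Optional custom comparison; when omitted, CSV files fall back to
        # pandas_df_equality_check and everything else to filecmp.cmp.
        downloaded_file_equality_check=None,
    )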
unstructured_ingest/__version__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "0.1.1"  # pragma: no cover
+__version__ = "0.2.0"  # pragma: no cover
unstructured_ingest/v2/cli/utils/click.py (+32 -1)

@@ -1,12 +1,13 @@
 import json
 import os.path
+from datetime import date, datetime
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
 from typing import Any, Optional, Type, TypeVar, Union
 
 import click
-from pydantic import BaseModel, ConfigDict, Secret
+from pydantic import BaseModel, ConfigDict, Secret, TypeAdapter, ValidationError
 
 
 def conform_click_options(options: dict):
@@ -109,6 +110,36 @@ class DelimitedString(click.ParamType):
         return split
 
 
+class PydanticDateTime(click.ParamType):
+    name = "datetime"
+
+    def convert(
+        self,
+        value: Any,
+        param: Optional[click.Parameter] = None,
+        ctx: Optional[click.Context] = None,
+    ) -> Any:
+        try:
+            return TypeAdapter(datetime).validate_strings(value)
+        except ValidationError:
+            self.fail(f"{value} is not a valid datetime", param, ctx)
+
+
+class PydanticDate(click.ParamType):
+    name = "date"
+
+    def convert(
+        self,
+        value: Any,
+        param: Optional[click.Parameter] = None,
+        ctx: Optional[click.Context] = None,
+    ) -> Any:
+        try:
+            return TypeAdapter(date).validate_strings(value)
+        except ValidationError:
+            self.fail(f"{value} is not a valid date", param, ctx)
+
+
 BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
 
 
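Both param types delegate parsing to pydantic's TypeAdapter, so any string pydantic can coerce to a datetime or date is accepted on the command line. A rough standalone illustration (the command and option are invented):

    import click

    @click.command()
    @click.option("--modified-since", type=PydanticDateTime())
    def ingest(modified_since):
        # modified_since arrives as a datetime.datetime instance
        click.echo(f"filtering files modified since {modified_since.isoformat()}")

    # `ingest --modified-since 2024-01-01T00:00:00` parses cleanly; an invalid
    # value fails with "<value> is not a valid datetime".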
unstructured_ingest/v2/cli/utils/model_conversion.py (+10 -3)

@@ -25,7 +25,12 @@ from pydantic.fields import FieldInfo
 from pydantic.types import _SecretBase
 from pydantic_core import PydanticUndefined
 
-from unstructured_ingest.v2.cli.utils.click import DelimitedString, Dict
+from unstructured_ingest.v2.cli.utils.click import (
+    DelimitedString,
+    Dict,
+    PydanticDate,
+    PydanticDateTime,
+)
 
 NoneType = type(None)
 
@@ -135,8 +140,10 @@ def get_type_from_annotation(field_type: Any) -> click.ParamType:
         return click.UUID
     if field_type is Path:
         return click.Path(path_type=Path)
-    if field_type
-        return
+    if field_type is datetime.datetime:
+        return PydanticDateTime()
+    if field_type is datetime.date:
+        return PydanticDate()
     if field_origin is Literal:
         return click.Choice(field_args)
     if isinstance(field_type, EnumMeta):
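With these branches, pydantic fields annotated as datetime.datetime or datetime.date map to the new param types when CLI options are generated; for example, assuming the function is called directly:

    assert isinstance(get_type_from_annotation(datetime.datetime), PydanticDateTime)
    assert isinstance(get_type_from_annotation(datetime.date), PydanticDate)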
unstructured_ingest/v2/interfaces/indexer.py (+4 -1)

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generator, Optional, TypeVar
+from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
 
 from pydantic import BaseModel
 
@@ -25,3 +25,6 @@ class Indexer(BaseProcess, BaseConnector, ABC):
     @abstractmethod
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         pass
+
+    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
+        raise NotImplementedError()
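run_async gives indexers an optional asynchronous counterpart to run; the base implementation raises NotImplementedError, so existing synchronous indexers are unaffected. A hedged sketch of an override, with invented listing helpers:

    class ExampleIndexer(Indexer):
        def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
            yield from self._list_files()  # hypothetical synchronous listing

        async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
            # hypothetical non-blocking listing, e.g. via an async client
            async for file_data in self._list_files_async():
                yield file_data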
unstructured_ingest/v2/pipeline/pipeline.py (+10 -2)

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 import multiprocessing as mp
 import shutil
@@ -186,6 +187,14 @@ class Pipeline:
         filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
         return filtered_records
 
+    def get_indices(self) -> list[dict]:
+        if self.indexer_step.process.is_async():
+            indices = asyncio.run(self.indexer_step.run_async())
+        else:
+            indices = self.indexer_step.run()
+        indices_inputs = [{"file_data_path": i} for i in indices]
+        return indices_inputs
+
     def _run(self):
         logger.info(
             f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
@@ -197,8 +206,7 @@ class Pipeline:
         self.context.status = {}
 
         # Index into data source
-        indices = self.indexer_step.run()
-        indices_inputs = [{"file_data_path": i} for i in indices]
+        indices_inputs = self.get_indices()
         if not indices_inputs:
             logger.info("No files to process after indexer, exiting")
             return
unstructured_ingest/v2/pipeline/steps/index.py (+18 -1)

@@ -1,7 +1,7 @@
 import hashlib
 import json
 from dataclasses import dataclass
-from typing import Generator, Optional, TypeVar
+from typing import AsyncGenerator, Generator, Optional, TypeVar
 
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
@@ -52,6 +52,23 @@ class IndexStep(PipelineStep):
                     raise e
                 continue
 
+    async def run_async(self) -> AsyncGenerator[str, None]:
+        async for file_data in self.process.run_async():
+            logger.debug(f"generated file data: {file_data.to_dict()}")
+            try:
+                record_hash = self.get_hash(extras=[file_data.identifier])
+                filename = f"{record_hash}.json"
+                filepath = (self.cache_dir / filename).resolve()
+                filepath.parent.mkdir(parents=True, exist_ok=True)
+                with open(str(filepath), "w") as f:
+                    json.dump(file_data.to_dict(), f, indent=2)
+                yield str(filepath)
+            except Exception as e:
+                logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
+                if self.context.raise_on_error:
+                    raise e
+                continue
+
     def get_hash(self, extras: Optional[list[str]]) -> str:
         index_config_dict = json.loads(
             serialize_base_model_json(model=self.process.index_config, sort_keys=True)
unstructured_ingest/v2/processes/connectors/__init__.py (+10 -0)

@@ -18,6 +18,8 @@ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
+from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
+from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -44,6 +46,8 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
 from .singlestore import singlestore_destination_entry
+from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
+from .slack import slack_source_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
@@ -54,6 +58,10 @@ add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_desti
 add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
 add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
 
+add_destination_entry(
+    destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
+)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -93,3 +101,5 @@ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destina
 add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
+
+add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
unstructured_ingest/v2/processes/connectors/databricks/volumes.py (+1 -1)

@@ -166,7 +166,7 @@ class DatabricksVolumesUploader(Uploader, ABC):
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(self.upload_config.path,
+        output_path = os.path.join(self.upload_config.path, file_data.source_identifiers.filename)
         with open(path, "rb") as elements_file:
             self.connection_config.get_client().files.upload(
                 file_path=output_path,
unstructured_ingest/v2/processes/connectors/delta_table.py (new file, +185 -0)

@@ -0,0 +1,185 @@
+import json
+import os
+from dataclasses import dataclass, field
+from multiprocessing import Process
+from pathlib import Path
+from typing import Any, Optional
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.table import convert_to_pandas_dataframe
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+
+CONNECTOR_TYPE = "delta_table"
+
+
+class DeltaTableAccessConfig(AccessConfig):
+    aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
+    aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
+
+
+class DeltaTableConnectionConfig(ConnectionConfig):
+    access_config: Secret[DeltaTableAccessConfig] = Field(
+        default=DeltaTableAccessConfig(), validate_default=True
+    )
+    aws_region: Optional[str] = Field(default=None, description="AWS Region")
+    table_uri: str = Field(
+        default=None,
+        description=(
+            "Local path or path to the target folder in the S3 bucket, "
+            "formatted as s3://my-bucket/my-folder/"
+        ),
+    )
+
+    def update_storage_options(self, storage_options: dict) -> None:
+        secrets = self.access_config.get_secret_value()
+        if self.aws_region and secrets.aws_access_key_id and secrets.aws_secret_access_key:
+            storage_options["AWS_REGION"] = self.aws_region
+            storage_options["AWS_ACCESS_KEY_ID"] = secrets.aws_access_key_id
+            storage_options["AWS_SECRET_ACCESS_KEY"] = secrets.aws_secret_access_key
+            # Delta-rs doesn't support concurrent S3 writes without external locks (DynamoDB).
+            # This flag allows single-writer uploads to S3 without using locks, according to:
+            # https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/
+            storage_options["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
+
+
+class DeltaTableUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class DeltaTableUploadStager(UploadStager):
+    upload_stager_config: DeltaTableUploadStagerConfig = field(
+        default_factory=lambda: DeltaTableUploadStagerConfig()
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+
+        output_path = Path(output_dir) / Path(f"{output_filename}.parquet")
+
+        df = convert_to_pandas_dataframe(elements_dict=elements_contents)
+        df.to_parquet(output_path)
+
+        return output_path
+
+
+class DeltaTableUploaderConfig(UploaderConfig):
+    pass
+
+
+@dataclass
+class DeltaTableUploader(Uploader):
+    upload_config: DeltaTableUploaderConfig
+    connection_config: DeltaTableConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self):
+        secrets = self.connection_config.access_config.get_secret_value()
+        if (
+            self.connection_config.aws_region
+            and secrets.aws_access_key_id
+            and secrets.aws_secret_access_key
+        ):
+            from fsspec import get_filesystem_class
+
+            try:
+                fs = get_filesystem_class("s3")(
+                    key=secrets.aws_access_key_id, secret=secrets.aws_secret_access_key
+                )
+                fs.write_bytes(path=self.connection_config.table_uri, value=b"")
+
+            except Exception as e:
+                logger.error(f"failed to validate connection: {e}", exc_info=True)
+                raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def process_csv(self, csv_paths: list[Path]) -> pd.DataFrame:
+        logger.debug(f"uploading content from {len(csv_paths)} csv files")
+        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+        return df
+
+    def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
+        logger.debug(f"uploading content from {len(json_paths)} json files")
+        all_records = []
+        for p in json_paths:
+            with open(p) as json_file:
+                all_records.extend(json.load(json_file))
+
+        return pd.DataFrame(data=all_records)
+
+    def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
+        logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
+        df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
+        return df
+
+    def read_dataframe(self, path: Path) -> pd.DataFrame:
+        if path.suffix == ".csv":
+            return self.process_csv(csv_paths=[path])
+        elif path.suffix == ".json":
+            return self.process_json(json_paths=[path])
+        elif path.suffix == ".parquet":
+            return self.process_parquet(parquet_paths=[path])
+        else:
+            raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
+
+    @requires_dependencies(["deltalake"], extras="delta-table")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        from deltalake.writer import write_deltalake
+
+        df = self.read_dataframe(path)
+        updated_upload_path = os.path.join(
+            self.connection_config.table_uri, file_data.source_identifiers.relative_path
+        )
+        logger.info(
+            f"writing {len(df)} rows to destination table "
+            f"at {updated_upload_path}\ndtypes: {df.dtypes}",
+        )
+        storage_options = {}
+        self.connection_config.update_storage_options(storage_options=storage_options)
+
+        writer_kwargs = {
+            "table_or_uri": updated_upload_path,
+            "data": df,
+            "mode": "overwrite",
+            "storage_options": storage_options,
+        }
+        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
+        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
+        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
+        # rust backend to finish
+        writer = Process(
+            target=write_deltalake,
+            kwargs=writer_kwargs,
+        )
+        writer.start()
+        writer.join()
+
+
+delta_table_destination_entry = DestinationRegistryEntry(
+    connection_config=DeltaTableConnectionConfig,
+    uploader=DeltaTableUploader,
+    uploader_config=DeltaTableUploaderConfig,
+    upload_stager=DeltaTableUploadStager,
+    upload_stager_config=DeltaTableUploadStagerConfig,
+)
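Based on the configuration classes above, pointing the new destination at an S3-backed Delta table would look roughly like this (bucket, region, and credentials are placeholders):

    connection_config = DeltaTableConnectionConfig(
        table_uri="s3://my-bucket/my-folder/",   # placeholder bucket/prefix
        aws_region="us-east-2",                  # placeholder region
        access_config=DeltaTableAccessConfig(
            aws_access_key_id="AKIA...",         # placeholder credentials
            aws_secret_access_key="...",
        ),
    )
    uploader = DeltaTableUploader(
        upload_config=DeltaTableUploaderConfig(),
        connection_config=connection_config,
    )
    uploader.precheck()  # writes an empty object to table_uri to validate the credentials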