unstructured-ingest 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (44)
  1. test/integration/connectors/sql/test_singlestore.py +156 -0
  2. test/integration/connectors/test_confluence.py +113 -0
  3. test/integration/connectors/test_kafka.py +67 -0
  4. test/integration/connectors/test_onedrive.py +112 -0
  5. test/integration/connectors/test_qdrant.py +137 -0
  6. test/integration/connectors/test_s3.py +1 -1
  7. test/integration/connectors/utils/docker.py +2 -1
  8. test/integration/connectors/utils/docker_compose.py +23 -8
  9. test/integration/connectors/utils/validation.py +73 -22
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/connector/kafka.py +0 -1
  12. unstructured_ingest/interfaces.py +7 -7
  13. unstructured_ingest/v2/interfaces/file_data.py +1 -0
  14. unstructured_ingest/v2/processes/chunker.py +2 -2
  15. unstructured_ingest/v2/processes/connectors/__init__.py +15 -7
  16. unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
  17. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -5
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +2 -10
  20. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  21. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +13 -0
  22. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +82 -0
  23. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +196 -0
  24. unstructured_ingest/v2/processes/connectors/kafka/local.py +75 -0
  25. unstructured_ingest/v2/processes/connectors/onedrive.py +163 -2
  26. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  27. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  28. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  29. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  30. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  31. unstructured_ingest/v2/processes/connectors/sql/__init__.py +5 -0
  32. unstructured_ingest/v2/processes/connectors/sql/postgres.py +1 -20
  33. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
  34. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/sql.py +15 -6
  36. unstructured_ingest/v2/processes/partitioner.py +14 -3
  37. unstructured_ingest/v2/unstructured_api.py +25 -11
  38. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/METADATA +17 -17
  39. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/RECORD +43 -27
  40. unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
  41. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.2.0.dist-info → unstructured_ingest-0.2.2.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/validation.py

@@ -7,13 +7,14 @@ from pathlib import Path
 from typing import Callable, Optional
 
 import pandas as pd
+from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
 
 from test.integration.connectors.utils.constants import expected_results_path
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     expected_df = pd.read_csv(expected_filepath)
     current_df = pd.read_csv(current_filepath)
     if expected_df.equals(current_df):
@@ -27,6 +28,42 @@ def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     return False
 
 
+def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+    return expected_soup.text == current_soup.text
+
+
+def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_text_lines = expected_f.readlines()
+    with current_filepath.open() as current_f:
+        current_text_lines = current_f.readlines()
+    if len(expected_text_lines) != len(current_text_lines):
+        print(
+            f"Lines in expected text file ({len(expected_text_lines)}) "
+            f"don't match current text file ({len(current_text_lines)})"
+        )
+        return False
+    expected_text = "\n".join(expected_text_lines)
+    current_text = "\n".join(current_text_lines)
+    if expected_text == current_text:
+        return True
+    print("txt content don't match:")
+    print(f"expected: {expected_text}")
+    print(f"current: {current_text}")
+    return False
+
+
+file_type_equality_check = {
+    ".json": json_equality_check,
+    ".html": html_equality_check,
+    ".txt": txt_equality_check,
+}
+
+
 @dataclass
 class ValidationConfigs:
     test_id: str
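
For context: json_equality_check (the renamed pandas_df_equality_check) still loads fixtures with pd.read_csv, while the two new checkers compare rendered text. html_equality_check in particular compares BeautifulSoup's extracted text rather than raw markup, so files that differ only in tags or attributes compare equal. A minimal standalone sketch of that semantics (the markup strings are hypothetical):

from bs4 import BeautifulSoup

# Tag names and attributes vanish under .text; only the extracted text is compared
a = BeautifulSoup('<p class="x">Hello <b>world</b></p>', "html.parser")
b = BeautifulSoup('<p>Hello <i>world</i></p>', "html.parser")
print(a.text == b.text)  # True: both extract "Hello world"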
@@ -39,6 +76,7 @@ class ValidationConfigs:
     )
     exclude_fields_extend: list[str] = field(default_factory=list)
     validate_downloaded_files: bool = False
+    validate_file_data: bool = True
     downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
@@ -86,7 +124,7 @@ class ValidationConfigs:
 
 def get_files(dir_path: Path) -> list[str]:
     return [
-        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.iterdir() if f.is_file()
+        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file()
     ]
 
 
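The get_files change matters for nested fixtures: Path.iterdir() yields only a directory's immediate children, while Path.rglob("*") walks the whole tree, so downloads saved in subdirectories are now included in the structure validation. A quick self-contained demonstration:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "a.txt").touch()
    (root / "sub").mkdir()
    (root / "sub" / "b.txt").touch()
    print(sorted(f.name for f in root.iterdir() if f.is_file()))   # ['a.txt']
    print(sorted(f.name for f in root.rglob("*") if f.is_file()))  # ['a.txt', 'b.txt']
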
@@ -122,6 +160,23 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+def detect_diff(
+    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
+) -> bool:
+    if expected_filepath.suffix != current_filepath.suffix:
+        return True
+    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+        return not downloaded_file_equality_check(expected_filepath, current_filepath)
+    current_suffix = expected_filepath.suffix
+    if current_suffix in file_type_equality_check:
+        equality_check_callable = file_type_equality_check[current_suffix]
+        return not equality_check_callable(
+            expected_filepath=expected_filepath, current_filepath=current_filepath
+        )
+    # Fallback is using filecmp.cmp to compare the files
+    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
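
detect_diff centralizes the comparison precedence: mismatched suffixes are an immediate diff, an explicit downloaded_file_equality_check on the configs wins next, then the suffix-keyed file_type_equality_check table, and filecmp.cmp(..., shallow=False) remains the byte-for-byte fallback. The same dispatch shape, reduced to a standalone sketch (the stub checker is illustrative):

import filecmp
from pathlib import Path
from typing import Callable

checks: dict[str, Callable[[Path, Path], bool]] = {
    ".txt": lambda a, b: a.read_text() == b.read_text(),
}

def files_match(a: Path, b: Path) -> bool:
    if a.suffix != b.suffix:
        return False
    checker = checks.get(a.suffix)
    if checker is not None:
        return checker(a, b)
    return filecmp.cmp(a, b, shallow=False)  # byte-for-byte fallback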
@@ -133,15 +188,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-            is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
-        elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
-            is_different = not pandas_df_equality_check(
-                expected_filepath=expected_file_path, current_filepath=current_file_path
-            )
-        else:
-            is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
-        if is_different:
+        if detect_diff(configs, expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -185,17 +232,19 @@
     download_dir: Path,
     all_file_data: list[FileData],
     save_downloads: bool = False,
+    save_filedata: bool = True,
 ):
     # Delete current files
     shutil.rmtree(path=output_dir, ignore_errors=True)
     output_dir.mkdir(parents=True)
     # Rewrite the current file data
-    file_data_output_path = output_dir / "file_data"
-    file_data_output_path.mkdir(parents=True, exist_ok=True)
-    for file_data in all_file_data:
-        file_data_path = file_data_output_path / f"{file_data.identifier}.json"
-        with file_data_path.open(mode="w") as f:
-            json.dump(file_data.to_dict(), f, indent=2)
+    if save_filedata:
+        file_data_output_path = output_dir / "file_data"
+        file_data_output_path.mkdir(parents=True, exist_ok=True)
+        for file_data in all_file_data:
+            file_data_path = file_data_output_path / f"{file_data.identifier}.json"
+            with file_data_path.open(mode="w") as f:
+                json.dump(file_data.to_dict(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -229,11 +278,12 @@ def run_all_validations(
         predownload_file_data=pre_data, postdownload_file_data=post_data
     )
     configs.run_download_dir_validation(download_dir=download_dir)
-    run_expected_results_validation(
-        expected_output_dir=test_output_dir / "file_data",
-        all_file_data=postdownload_file_data,
-        configs=configs,
-    )
+    if configs.validate_file_data:
+        run_expected_results_validation(
+            expected_output_dir=test_output_dir / "file_data",
+            all_file_data=postdownload_file_data,
+            configs=configs,
+        )
     download_files = get_files(dir_path=download_dir)
     download_files.sort()
     run_directory_structure_validation(
@@ -291,4 +341,5 @@ async def source_connector_validation(
         download_dir=download_dir,
         all_file_data=all_postdownload_file_data,
         save_downloads=configs.validate_downloaded_files,
+        save_filedata=configs.validate_file_data,
     )
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.2.0"  # pragma: no cover
+__version__ = "0.2.2"  # pragma: no cover
unstructured_ingest/connector/kafka.py

@@ -181,7 +181,6 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
                 logger.debug(f"found {len(collected)} messages, stopping")
                 consumer.commit(asynchronous=False)
                 break
-
         return [
             KafkaIngestDoc(
                 connector_config=self.connector_config,
unstructured_ingest/interfaces.py

@@ -21,6 +21,7 @@ from unstructured_ingest.enhanced_dataclass.core import _asdict
 from unstructured_ingest.error import PartitionError, SourceConnectionError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.v2.unstructured_api import call_api
 
 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
@@ -565,6 +566,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
     ) -> list["Element"]:
         from unstructured.documents.elements import DataSourceMetadata
         from unstructured.partition.auto import partition
+        from unstructured.staging.base import elements_from_dicts
 
         if not partition_config.partition_by_api:
             logger.debug("Using local partition")
@@ -582,18 +584,16 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
                 **partition_kwargs,
             )
         else:
-            from unstructured.partition.api import partition_via_api
-
             endpoint = partition_config.partition_endpoint
 
             logger.debug(f"using remote partition ({endpoint})")
-
-            elements = partition_via_api(
-                filename=str(self.filename),
+            elements_dicts = call_api(
+                server_url=endpoint,
                 api_key=partition_config.api_key,
-                api_url=endpoint,
-                **partition_kwargs,
+                filename=Path(self.filename),
+                api_parameters=partition_kwargs,
             )
+            elements = elements_from_dicts(elements_dicts)
         # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
         # pass the stringified json here
         return elements
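
Remote partitioning no longer goes through unstructured's partition_via_api; it now uses the ingest package's own call_api wrapper (imported above), which returns plain element dicts, and elements_from_dicts rehydrates those into Element objects. A sketch of the rehydration step, assuming the standard element-dict shape (the payload shown is illustrative):

from unstructured.staging.base import elements_from_dicts

element_dicts = [
    {"type": "NarrativeText", "element_id": "e1", "text": "Hello world", "metadata": {}},
]
elements = elements_from_dicts(element_dicts)
print(elements[0].text)  # "Hello world"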
unstructured_ingest/v2/interfaces/file_data.py

@@ -43,6 +43,7 @@ class FileData(DataClassJsonMixin):
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
 
     @classmethod
     def from_file(cls, path: str) -> "FileData":
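
Because display_name defaults to None, FileData payloads serialized before this release (which lack the key) still deserialize cleanly. A self-contained sketch of that backward-compatibility property, using a hypothetical stand-in for FileData:

from dataclasses import dataclass
from typing import Optional

from dataclasses_json import DataClassJsonMixin

@dataclass
class Demo(DataClassJsonMixin):  # hypothetical stand-in for FileData
    identifier: str
    display_name: Optional[str] = None

# A dict written before the field existed still loads; the new field defaults to None
print(Demo.from_dict({"identifier": "abc"}).display_name)  # None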
unstructured_ingest/v2/processes/chunker.py

@@ -9,7 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.unstructured_api import call_api
+from unstructured_ingest.v2.unstructured_api import call_api_async
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -112,7 +112,7 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-        elements = await call_api(
+        elements = await call_api_async(
             server_url=self.config.chunking_endpoint,
             api_key=self.config.chunk_api_key.get_secret_value(),
             filename=elements_filepath,
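
The rename clarifies that Chunker.run_async awaits the asynchronous variant, while the synchronous call_api is what the v1 interfaces.py above now calls. A hypothetical sketch of how such a sync/async pair is typically structured (the real signatures live in unstructured_ingest/v2/unstructured_api.py and may differ):

import asyncio
from pathlib import Path

async def call_api_async(server_url: str, api_key: str, filename: Path, api_parameters: dict) -> list[dict]:
    ...  # issue the request via the unstructured-client SDK

def call_api(server_url: str, api_key: str, filename: Path, api_parameters: dict) -> list[dict]:
    # synchronous facade for callers that are not inside an event loop
    return asyncio.run(call_api_async(server_url, api_key, filename, api_parameters))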
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
@@ -11,17 +13,21 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
-from .astradb import astra_db_destination_entry
+from .astradb import astra_db_destination_entry, astra_db_source_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
 from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
+from .confluence import confluence_source_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
+from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
@@ -33,7 +39,7 @@ from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
-from .onedrive import onedrive_source_entry
+from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -44,13 +50,12 @@ from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
 from .salesforce import salesforce_source_entry
 from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
-from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
-from .singlestore import singlestore_destination_entry
 from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
 from .slack import slack_source_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
+add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
 add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
@@ -73,6 +78,7 @@ add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
 add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)
 
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
+add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
 add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
 add_destination_entry(
@@ -88,9 +94,7 @@ add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
 
 add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
 add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
-add_destination_entry(
-    destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
-)
+
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
     destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
@@ -102,4 +106,8 @@ add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
 
+add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
+
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+
+add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
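
Registration summary: AstraDB gains a source entry alongside its existing destination, OneDrive gains a destination, GitLab and Confluence are new sources, and SingleStore's registration moves out of this module (it now lives under the sql subpackage, per the file list above). The kafka and qdrant subpackages are imported for their side effects, presumably registering their cloud/local variants on import. A hypothetical, simplified sketch of the registry pattern these calls rely on (the real entry types live in unstructured_ingest/v2/processes/connector_registry.py):

from dataclasses import dataclass
from typing import Any

@dataclass
class SourceEntry:  # stand-in for the real registry entry type
    indexer: Any
    downloader: Any

source_registry: dict[str, SourceEntry] = {}

def add_source_entry(source_type: str, entry: SourceEntry) -> None:
    if source_type in source_registry:
        raise ValueError(f"source connector {source_type} already registered")
    source_registry[source_type] = entry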