unstructured-ingest 0.5.15__py3-none-any.whl → 0.5.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_zendesk.py +31 -53
- test/integration/connectors/utils/validation/source.py +5 -3
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +3 -3
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/processes/connectors/astradb.py +1 -1
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/zendesk/client.py +221 -156
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +83 -274
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/METADATA +20 -20
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/RECORD +15 -15
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.15.dist-info → unstructured_ingest-0.5.17.dist-info}/top_level.txt +0 -0
--- a/test/integration/connectors/test_zendesk.py
+++ b/test/integration/connectors/test_zendesk.py
@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import Optional
 
 import pytest
 
@@ -21,20 +20,20 @@ from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
     ZendeskIndexerConfig,
 )
 
+SUBDOMAIN = "unstructuredhelp"
+EMAIL = "test@unstructured.io"
 
-async def zendesk_source_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):
 
-
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_tickets(temp_dir: Path):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=…
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
-    index_config = ZendeskIndexerConfig(…
+    index_config = ZendeskIndexerConfig(item_type="tickets")
 
     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -43,7 +42,7 @@ async def zendesk_source_test(
     )
 
     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=…
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir)
 
     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -57,26 +56,23 @@ async def zendesk_source_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-tickets",
-            expected_num_files=…
+            expected_num_files=8,
             validate_file_data=False,
             validate_downloaded_files=True,
         ),
     )
 
 
-async def zendesk_source_articles_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):
-
-    access_config = ZendeskAccessConfig(api_token=token)
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_articles(temp_dir):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=…
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
-    index_config = ZendeskIndexerConfig(…
+    index_config = ZendeskIndexerConfig(item_type="articles")
 
     indexer = ZendeskIndexer(
         connection_config=connection_config,
@@ -85,7 +81,7 @@ async def zendesk_source_articles_test(
     )
 
     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=…
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True)
 
     downloader = ZendeskDownloader(
         connection_config=connection_config,
@@ -99,44 +95,26 @@ async def zendesk_source_articles_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-articles",
-            expected_num_files=…
-            validate_file_data=…
+            expected_num_files=8,
+            validate_file_data=True,
             validate_downloaded_files=True,
         ),
     )
 
 
-@pytest.mark.asyncio
 @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-@requires_env("ZENDESK_TOKEN")
-async def test_zendesk_source_tickets(temp_dir):
-    await zendesk_source_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+def test_zendesk_source_articles_fail(temp_dir):
+    access_config = ZendeskAccessConfig(api_token="FAKE_TOKEN")
+    connection_config = ZendeskConnectionConfig(
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
+    index_config = ZendeskIndexerConfig(item_type="tickets")
 
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-@requires_env("ZENDESK_TOKEN")
-async def test_zendesk_source_articles(temp_dir):
-    await zendesk_source_articles_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+    indexer = ZendeskIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+        connector_type=CONNECTOR_TYPE,
     )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-async def test_zendesk_source_articles_fail(temp_dir):
     with pytest.raises(expected_exception=UserAuthError):
-        await zendesk_source_articles_test(
-            tmp_path=temp_dir,
-            token="FORCE_FAIL_TOKEN",
-            email="test@unstructured.io",
-            subdomain="unstructuredhelp",
-        )
+        indexer.precheck()
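Note on the test refactor above: the shared zendesk_source_test / zendesk_source_articles_test helpers are gone. Each scenario is now a self-contained test, the subdomain and email live in module constants, and the auth-failure case no longer drives a full pipeline run: it builds an indexer with a bogus token and asserts that the synchronous indexer.precheck() raises UserAuthError. A minimal sketch of the requires_env guard these tests rely on, assuming the usual skip-if-unset semantics (the real helper lives in the repo's test utils and may differ):

import os

import pytest


def requires_env(*names: str):
    # skip the test when any of the named environment variables is unset
    missing = [name for name in names if name not in os.environ]
    return pytest.mark.skipif(
        bool(missing), reason=f"requires env vars: {', '.join(missing)}"
    )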
--- a/test/integration/connectors/utils/validation/source.py
+++ b/test/integration/connectors/utils/validation/source.py
@@ -103,7 +103,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
        expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.…
+    current_file_data_contents = json.loads(file_data.model_dump_json())
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -184,7 +184,7 @@ def update_fixtures(
     for file_data in all_file_data:
         file_data_path = file_data_output_path / f"{file_data.identifier}.json"
         with file_data_path.open(mode="w") as f:
-            …
+            f.write(file_data.model_dump_json(indent=2))
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -216,7 +216,9 @@ def run_all_validations(
         len(predownload_file_data) == expected_number_indexed_file_data
     ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
     if expected_num_files := configs.expected_num_files:
-        assert …
+        assert (
+            len(postdownload_file_data) == expected_num_files
+        ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"
 
     for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
         configs.run_file_data_validation(
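Note on the validation changes above: fixture comparison now routes through pydantic v2 serialization, and the expected-file-count assert gains a failure message matching the one just above it. Comparing json.loads(file_data.model_dump_json()) against the stored fixture means both sides hold only JSON-native types, so DeepDiff compares like with like. A small illustration of why the JSON round trip matters, with FileData standing in for any pydantic v2 model (field names here are illustrative):

import json
from pathlib import Path

from pydantic import BaseModel


class FileData(BaseModel):
    identifier: str
    source_path: Path


fd = FileData(identifier="abc", source_path=Path("/tmp/a.txt"))

# model_dump() keeps the Path object, which a JSON fixture can never equal;
# the model_dump_json() round trip coerces it to a plain string
assert fd.model_dump()["source_path"] == Path("/tmp/a.txt")
assert json.loads(fd.model_dump_json()) == {
    "identifier": "abc",
    "source_path": "/tmp/a.txt",
}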
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.15"  # pragma: no cover
+__version__ = "0.5.17"  # pragma: no cover
--- a/unstructured_ingest/v2/pipeline/steps/download.py
+++ b/unstructured_ingest/v2/pipeline/steps/download.py
@@ -88,9 +88,9 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.…
+            logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
             with file_data_path.open("w") as file:
-                …
+                file.write(file_data.model_dump_json(indent=2))
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = file_data_from_file(path=file_data_path)
@@ -173,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-            …
+            f.write(file_data.model_dump_json(indent=2))
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
--- a/unstructured_ingest/v2/pipeline/steps/index.py
+++ b/unstructured_ingest/v2/pipeline/steps/index.py
@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.…
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    …
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
 
     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.…
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    …
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
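Note on the two pipeline steps above: both the download and index steps now persist FileData records via model_dump_json(indent=2) instead of hand-serializing a dict, and debug-log the same JSON. A sketch of the resulting write/read symmetry; read_file_data below is a hypothetical stand-in for the pipeline's own file_data_from_file helper:

import json

from pydantic import BaseModel


class FileData(BaseModel):
    identifier: str


def write_file_data(path: str, file_data: FileData) -> None:
    # model_dump_json applies pydantic field serializers that a json.dump
    # over a hand-built dict would bypass
    with open(path, "w") as f:
        f.write(file_data.model_dump_json(indent=2))


def read_file_data(path: str) -> FileData:
    # hypothetical reader; validates the stored JSON back into the model
    with open(path) as f:
        return FileData.model_validate(json.loads(f.read()))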
--- a/unstructured_ingest/v2/processes/connectors/astradb.py
+++ b/unstructured_ingest/v2/processes/connectors/astradb.py
@@ -112,7 +112,6 @@ def get_astra_collection(
     collection_name: str,
     keyspace: str,
 ) -> "AstraDBCollection":
-
     astra_db = get_astra_db(connection_config=connection_config, keyspace=keyspace)
 
     # astradb will return a collection object in all cases (even if it doesn't exist)
@@ -315,6 +314,7 @@ class AstraDBUploadStager(UploadStager):
                 text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
             )
         metadata["original_elements"] = format_and_truncate_orig_elements(element_dict)
+        metadata.pop("orig_elements", None)
 
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         self.truncate_dict_elements(element_dict)
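Note on the stager change above: once the truncated copy is stored under "original_elements", the raw "orig_elements" key is dropped so the same payload is not uploaded twice; the None default makes the pop a no-op when the key was never present. The pattern in isolation (helper names here are illustrative, not the connector's API):

def conform_metadata(metadata: dict, truncated_copy: str) -> dict:
    metadata["original_elements"] = truncated_copy
    # drop the raw key; the None default keeps this safe when it is absent
    metadata.pop("orig_elements", None)
    return metadata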
--- a/unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
+++ b/unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
@@ -61,7 +61,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                         self.upload_config.database, ", ".join(databases)
                     )
                 )
-            cursor.execute("SHOW TABLES")
+            cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
            table_names = [r[1] for r in cursor.fetchall()]
            if self.upload_config.table_name not in table_names:
                raise ValueError(
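Note on the uploader precheck above: an unqualified SHOW TABLES lists whatever schema the session currently defaults to, so the table-existence check could inspect the wrong database; qualifying the statement with IN {database} pins it to the configured one. A sketch of the scoped check, DB-API style (cursor setup elided and names other than the SHOW TABLES statement illustrative; Spark/Databricks returns (database, tableName, isTemporary) rows, hence row[1]):

def table_exists(cursor, database: str, table_name: str) -> bool:
    cursor.execute(f"SHOW TABLES IN {database}")
    return table_name in {row[1] for row in cursor.fetchall()}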