unstructured-ingest 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic.

test/integration/connectors/test_lancedb.py
@@ -12,6 +12,7 @@ from lancedb import AsyncConnection
 from upath import UPath
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
     LanceDBAwsAccessConfig,
@@ -43,7 +44,6 @@ DATABASE_NAME = "database"
 TABLE_NAME = "elements"
 DIMENSION = 384
 NUMBER_EXPECTED_ROWS = 22
-NUMBER_EXPECTED_COLUMNS = 10
 S3_BUCKET = "s3://utic-ingest-test-fixtures/"
 GS_BUCKET = "gs://utic-test-ingest-fixtures-output/"
 AZURE_BUCKET = "az://utic-ingest-test-fixtures-output/"
@@ -54,9 +54,9 @@ REQUIRED_ENV_VARS = {
     "local": (),
 }
 
-
 SCHEMA = pa.schema(
     [
+        pa.field(RECORD_ID_LABEL, pa.string()),
         pa.field("vector", pa.list_(pa.float16(), DIMENSION)),
         pa.field("text", pa.string(), nullable=True),
         pa.field("type", pa.string(), nullable=True),
@@ -69,6 +69,7 @@ SCHEMA = pa.schema(
         pa.field("metadata-page_number", pa.int32(), nullable=True),
     ]
 )
+NUMBER_EXPECTED_COLUMNS = len(SCHEMA.names)
 
 
 @pytest_asyncio.fixture
@@ -116,7 +117,7 @@ async def test_lancedb_destination(
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
         connector_type=CONNECTOR_TYPE,
-        identifier="mock file data",
+        identifier="mock-file-data",
     )
     stager = LanceDBUploadStager()
     uploader = _get_uploader(uri)
@@ -129,17 +130,52 @@ async def test_lancedb_destination(
 
     await uploader.run_async(path=staged_file_path, file_data=file_data)
 
-    table = await connection.open_table(TABLE_NAME)
-    table_df: pd.DataFrame = await table.to_pandas()
+    # Test upload to empty table
+    with await connection.open_table(TABLE_NAME) as table:
+        table_df: pd.DataFrame = await table.to_pandas()
 
     assert len(table_df) == NUMBER_EXPECTED_ROWS
     assert len(table_df.columns) == NUMBER_EXPECTED_COLUMNS
 
+    assert table_df[RECORD_ID_LABEL][0] == file_data.identifier
     assert table_df["element_id"][0] == "2470d8dc42215b3d68413b55bf00fed2"
     assert table_df["type"][0] == "CompositeElement"
     assert table_df["metadata-filename"][0] == "DA-1p-with-duplicate-pages.pdf.json"
     assert table_df["metadata-text_as_html"][0] is None
 
+    # Test upload of the second file, rows should be appended
+    file_data.identifier = "mock-file-data-2"
+    staged_second_file_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=f"{upload_file.stem}-2{upload_file.suffix}",
+    )
+    await uploader.run_async(path=staged_second_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        appended_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(appended_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+    # Test re-upload of the first file, rows should be overwritten, not appended
+    await uploader.run_async(path=staged_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        overwritten_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(overwritten_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+
+class TestPrecheck:
+    @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+    @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
+    def test_succeeds(
+        self,
+        upload_file: Path,
+        connection_with_uri: tuple[AsyncConnection, str],
+        tmp_path: Path,
+    ) -> None:
+        _, uri = connection_with_uri
+        uploader = _get_uploader(uri)
+        uploader.precheck()
+
 
 def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path) -> str:
     if target == "local":
@@ -158,11 +194,12 @@ def _get_uploader(
     uri: str,
 ) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
     target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
+    upload_config = LanceDBUploaderConfig(table_name=TABLE_NAME)
     if target == "az":
         azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
         access_config_kwargs = _parse_azure_connection_string(azure_connection_string)
         return LanceDBAzureUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBAzureConnectionConfig(
                 access_config=LanceDBAzureAccessConfig(**access_config_kwargs),
                 uri=uri,
@@ -171,7 +208,7 @@ def _get_uploader(
 
     elif target == "s3":
         return LanceDBAwsUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBAwsConnectionConfig(
                 access_config=LanceDBAwsAccessConfig(
                     aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
@@ -182,7 +219,7 @@ def _get_uploader(
         )
     elif target == "gs":
         return LanceDBGSPUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBGCSConnectionConfig(
                 access_config=LanceDBGCSAccessConfig(
                     google_service_account_key=os.getenv("GCP_INGEST_SERVICE_KEY")
@@ -192,7 +229,7 @@ def _get_uploader(
         )
     else:
        return LanceDBLocalUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBLocalConnectionConfig(
                 access_config=LanceDBLocalAccessConfig(),
                 uri=uri,

test/integration/connectors/test_pinecone.py
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 import re
 import time
@@ -19,6 +20,7 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connectors.pinecone import (
     CONNECTOR_TYPE,
+    MAX_QUERY_RESULTS,
     PineconeAccessConfig,
     PineconeConnectionConfig,
     PineconeUploader,
@@ -118,7 +120,10 @@ def validate_pinecone_index(
             f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
         )
         time.sleep(interval)
-    assert vector_count == expected_num_of_vectors
+    assert vector_count == expected_num_of_vectors, (
+        f"vector count from index ({vector_count}) doesn't "
+        f"match expected number: {expected_num_of_vectors}"
+    )
 
 
 @requires_env(API_KEY)
@@ -147,10 +152,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
     uploader.precheck()
 
-    if uploader.is_async():
-        await uploader.run_async(path=new_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=new_upload_file, file_data=file_data)
+    uploader.run(path=new_upload_file, file_data=file_data)
     with new_upload_file.open() as f:
         staged_content = json.load(f)
     expected_num_of_vectors = len(staged_content)
@@ -160,10 +162,59 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     )
 
     # Rerun uploader and make sure no duplicates exist
-    if uploader.is_async():
-        await uploader.run_async(path=new_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=new_upload_file, file_data=file_data)
+    uploader.run(path=new_upload_file, file_data=file_data)
+    logger.info("validating second upload")
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+
+
+@requires_env(API_KEY)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.skip(reason="TODO: get this to work")
+async def test_pinecone_destination_large_index(
+    pinecone_index: str, upload_file: Path, temp_dir: Path
+):
+    new_file = temp_dir / "large_file.json"
+    with upload_file.open() as f:
+        upload_content = json.load(f)
+
+    min_entries = math.ceil((MAX_QUERY_RESULTS * 2) / len(upload_content))
+    new_content = (upload_content * min_entries)[: (2 * MAX_QUERY_RESULTS)]
+    print(f"Creating large index content with {len(new_content)} records")
+    with new_file.open("w") as f:
+        json.dump(new_content, f)
+
+    expected_num_of_vectors = len(new_content)
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=new_file.name, filename=new_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="pinecone_mock_id",
+    )
+    connection_config = PineconeConnectionConfig(
+        index_name=pinecone_index,
+        access_config=PineconeAccessConfig(api_key=get_api_key()),
+    )
+    stager_config = PineconeUploadStagerConfig()
+    stager = PineconeUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=new_file,
+        output_dir=temp_dir,
+        output_filename=new_file.name,
+        file_data=file_data,
+    )
+
+    upload_config = PineconeUploaderConfig()
+    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+
+    uploader.run(path=new_upload_file, file_data=file_data)
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+    # Rerun uploader and make sure no duplicates exist
+    uploader.run(path=new_upload_file, file_data=file_data)
     logger.info("validating second upload")
     validate_pinecone_index(
         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.4" # pragma: no cover
+__version__ = "0.3.6" # pragma: no cover

unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -233,8 +233,7 @@ class AzureAISearchUploader(Uploader):
            raise WriteError(
                ", ".join(
                    [
-                        f"{error.azure_ai_search_key}: "
-                        f"[{error.status_code}] {error.error_message}"
+                        f"{error.key}: " f"[{error.status_code}] {error.error_message}"
                        for error in errors
                    ],
                ),

unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py
@@ -142,8 +142,6 @@ class ElasticsearchIndexer(Indexer):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise SourceConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.index_config.index_name not in indices:
                     raise SourceConnectionError(
@@ -393,11 +391,9 @@ class ElasticsearchUploader(Uploader):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise DestinationConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.upload_config.index_name not in indices:
-                    raise SourceConnectionError(
+                    raise DestinationConnectionError(
                         "index {} not found: {}".format(
                             self.upload_config.index_name, ", ".join(indices.keys())
                         )

unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -15,6 +15,7 @@ from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.connector import ConnectionConfig
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig
@@ -84,7 +85,7 @@ class LanceDBUploadStager(UploadStager):
 
         df = pd.DataFrame(
             [
-                self._conform_element_contents(element_contents)
+                self._conform_element_contents(element_contents, file_data)
                 for element_contents in elements_contents
             ]
         )
@@ -94,9 +95,10 @@ class LanceDBUploadStager(UploadStager):
 
         return output_path
 
-    def _conform_element_contents(self, element: dict) -> dict:
+    def _conform_element_contents(self, element: dict, file_data: FileData) -> dict:
         return {
             "vector": element.pop("embeddings", None),
+            RECORD_ID_LABEL: file_data.identifier,
             **flatten_dict(element, separator="-"),
         }
 
@@ -134,6 +136,14 @@ class LanceDBUploader(Uploader):
         async with self.get_table() as table:
             schema = await table.schema()
             df = self._fit_to_schema(df, schema)
+            if RECORD_ID_LABEL not in schema.names:
+                logger.warning(
+                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
+                    " string which is required to support overwriting updates on subsequent"
+                    " uploads of the same record. New rows will be appended instead."
+                )
+            else:
+                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)
 
     def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
@@ -31,6 +31,7 @@ CONNECTOR_TYPE = "pinecone"
31
31
  MAX_PAYLOAD_SIZE = 2 * 1024 * 1024 # 2MB
32
32
  MAX_POOL_THREADS = 100
33
33
  MAX_METADATA_BYTES = 40960 # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
34
+ MAX_QUERY_RESULTS = 10000
34
35
 
35
36
 
36
37
  class PineconeAccessConfig(AccessConfig):
@@ -84,7 +85,7 @@ ALLOWED_FIELDS = (
84
85
 
85
86
  class PineconeUploadStagerConfig(UploadStagerConfig):
86
87
  metadata_fields: list[str] = Field(
87
- default=str(ALLOWED_FIELDS),
88
+ default=list(ALLOWED_FIELDS),
88
89
  description=(
89
90
  "which metadata from the source element to map to the payload metadata being sent to "
90
91
  "Pinecone."
@@ -137,7 +138,6 @@ class PineconeUploadStager(UploadStager):
137
138
  flatten_lists=True,
138
139
  remove_none=True,
139
140
  )
140
- metadata[RECORD_ID_LABEL] = file_data.identifier
141
141
  metadata_size_bytes = len(json.dumps(metadata).encode())
142
142
  if metadata_size_bytes > MAX_METADATA_BYTES:
143
143
  logger.info(
@@ -146,6 +146,8 @@ class PineconeUploadStager(UploadStager):
146
146
  )
147
147
  metadata = {}
148
148
 
149
+ metadata[RECORD_ID_LABEL] = file_data.identifier
150
+
149
151
  return {
150
152
  "id": str(uuid.uuid4()),
151
153
  "values": embeddings,
@@ -213,6 +215,18 @@ class PineconeUploader(Uploader):
213
215
  f"from pinecone index: {resp}"
214
216
  )
215
217
 
218
+ def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
219
+ while True:
220
+ query_results = index.query(**query_params)
221
+ matches = query_results.get("matches", [])
222
+ if not matches:
223
+ break
224
+ ids = [match["id"] for match in matches]
225
+ delete_params = {"ids": ids}
226
+ if namespace := self.upload_config.namespace:
227
+ delete_params["namespace"] = namespace
228
+ index.delete(**delete_params)
229
+
216
230
  def serverless_delete_by_record_id(self, file_data: FileData) -> None:
217
231
  logger.debug(
218
232
  f"deleting any content with metadata "
@@ -221,29 +235,25 @@ class PineconeUploader(Uploader):
221
235
  )
222
236
  index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
223
237
  index_stats = index.describe_index_stats()
238
+ dimension = index_stats["dimension"]
224
239
  total_vectors = index_stats["total_vector_count"]
225
240
  if total_vectors == 0:
226
241
  return
227
- dimension = index_stats["dimension"]
228
- query_params = {
229
- "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
230
- "vector": [0] * dimension,
231
- "top_k": total_vectors,
232
- }
233
- if namespace := self.upload_config.namespace:
234
- query_params["namespace"] = namespace
235
- while True:
236
- query_results = index.query(**query_params)
237
- matches = query_results.get("matches", [])
238
- if not matches:
239
- break
240
- ids = [match["id"] for match in matches]
241
- delete_params = {"ids": ids}
242
+ while total_vectors > 0:
243
+ top_k = min(total_vectors, MAX_QUERY_RESULTS)
244
+ query_params = {
245
+ "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
246
+ "vector": [0] * dimension,
247
+ "top_k": top_k,
248
+ }
242
249
  if namespace := self.upload_config.namespace:
243
- delete_params["namespace"] = namespace
244
- index.delete(**delete_params)
245
- logger.debug(
246
- f"deleted any content with metadata "
250
+ query_params["namespace"] = namespace
251
+ self.delete_by_query(index=index, query_params=query_params)
252
+ index_stats = index.describe_index_stats()
253
+ total_vectors = index_stats["total_vector_count"]
254
+
255
+ logger.info(
256
+ f"deleted {total_vectors} records with metadata "
247
257
  f"{self.upload_config.record_id_key}={file_data.identifier} "
248
258
  f"from pinecone index"
249
259
  )

unstructured_ingest/v2/processes/connectors/weaviate/__init__.py
@@ -10,8 +10,6 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
 from .embedded import weaviate_embedded_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
 from .local import weaviate_local_destination_entry
-from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
-from .weaviate import weaviate_destination_entry
 
 add_destination_entry(
     destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -22,4 +20,3 @@ add_destination_entry(
 add_destination_entry(
     destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
 )
-add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)

unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py
@@ -22,7 +22,6 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
     from weaviate.classes.init import Timeout
@@ -288,12 +287,3 @@ class WeaviateUploader(Uploader, ABC):
                     vector=vector,
                 )
         self.check_for_errors(client=weaviate_client)
-
-
-weaviate_destination_entry = DestinationRegistryEntry(
-    connection_config=WeaviateConnectionConfig,
-    uploader=WeaviateUploader,
-    uploader_config=WeaviateUploaderConfig,
-    upload_stager=WeaviateUploadStager,
-    upload_stager_config=WeaviateUploadStagerConfig,
-)

unstructured_ingest-0.3.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.4
+Version: 0.3.6
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: pandas
 Requires-Dist: python-dateutil
+Requires-Dist: tqdm
 Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
-Requires-Dist: tqdm
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: pandas
 Requires-Dist: click
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
@@ -51,8 +51,8 @@ Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -98,8 +98,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -117,26 +117,26 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
-Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -163,18 +163,18 @@ Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv

unstructured_ingest-0.3.6.dist-info/RECORD
@@ -10,11 +10,11 @@ test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworD
 test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
 test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
 test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
-test/integration/connectors/test_lancedb.py,sha256=8hRlqw3zYOcFCu6PPlejquSvvEM_3OEBzKTQbNm_Zmg,7635
+test/integration/connectors/test_lancedb.py,sha256=U2HfIrf6iJ7lYMn-vz0j-LesVyDY-jc9QrQhlJVhG9Q,9183
 test/integration/connectors/test_milvus.py,sha256=p4UujDr_tsRaQDmhDmDZp38t8oSFm7hrTqiq6NNuhGo,5933
 test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
 test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
-test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
+test/integration/connectors/test_pinecone.py,sha256=i-v5WkAI9M6SUZI7ch9qdILlRHopAdptpkSY12-BaTk,9483
 test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
 test/integration/connectors/test_s3.py,sha256=YHEYMqWTKTfR7wlL4VoxtgMs1YiYKyhLIBdG-anaQGo,6896
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -83,7 +83,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=0rNziXrR8RxleBY3pKm77TbOCJ0CwApHiLqXBAViUAo,42
+unstructured_ingest/__version__.py,sha256=J7Aic1p5b4KF_ydqV36h8cvEIhTtU-IJ72bMV9mQs8w,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -398,7 +398,7 @@ unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2j
 unstructured_ingest/v2/processes/connectors/__init__.py,sha256=8M3aYYNbOkS2SYG2B_HLHMgX4V69-Oz1VqpQcRQMiVg,5167
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=QTUQ-cv_iZi9eaXRRHQNKhtgFn-Pi20AXdSVaDFg9DM,15498
-unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
+unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=97HxxVvqf-80Bxb-AaBhFhMvoRl7cUjn4n-39vCAVG0,11962
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LbUJLt6fqaNYSmy9vUiovG-UOALMcvh8OD-gZAaf-f4,12333
@@ -411,7 +411,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqA
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=-J6QPJv_jmjln8cTUsfEEAyd_hi_fmD-uwB6C84rA4w,11930
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -423,7 +423,7 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
-unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=lzbrQ66zz3Dh_G29XFkyzQ84St8H_xfQVsYV4mTf32c,19141
+unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=sI58uypWr1mpSl4bxr46nIfypGZ4aqryCT83qqCVnSM,18921
 unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
@@ -443,7 +443,7 @@ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur
 unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
 unstructured_ingest/v2/processes/connectors/lancedb/cloud.py,sha256=BFy0gW2OZ_qaZJM97m-tNsFaJPi9zOKrrd2y4thcNP0,1341
 unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
-unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
+unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7FODnesYu8cFx1PeQJZxXij-8Dei4Kk3Bs0oxoUGBtI,5745
 unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
 unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
 unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
@@ -456,14 +456,14 @@ unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
 unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
-unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
+unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
-unstructured_ingest-0.3.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.3.4.dist-info/METADATA,sha256=6Nj2KHvch7j5QLfahz5NcFHmmNq9vNixTfZSDUEQPjo,7393
-unstructured_ingest-0.3.4.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.3.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.3.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.3.4.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
+unstructured_ingest-0.3.6.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.3.6.dist-info/METADATA,sha256=JmWEiv5oO6crJ6dRAOcBrCiJI12tOonA_arMTa5HoJY,7393
+unstructured_ingest-0.3.6.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.3.6.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.3.6.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.3.6.dist-info/RECORD,,