unstructured-ingest 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (22) hide show
  1. test/integration/connectors/test_lancedb.py +7 -7
  2. test/integration/connectors/test_milvus.py +34 -6
  3. test/integration/connectors/weaviate/test_cloud.py +34 -0
  4. test/integration/embedders/test_azure_openai.py +59 -0
  5. test/unit/test_utils.py +21 -1
  6. unstructured_ingest/__version__.py +1 -1
  7. unstructured_ingest/embed/azure_openai.py +31 -0
  8. unstructured_ingest/utils/string_and_date_utils.py +10 -0
  9. unstructured_ingest/v2/processes/connectors/astradb.py +16 -0
  10. unstructured_ingest/v2/processes/connectors/couchbase.py +4 -1
  11. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -4
  12. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +7 -7
  13. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  14. unstructured_ingest/v2/processes/connectors/milvus.py +9 -3
  15. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +4 -3
  16. unstructured_ingest/v2/processes/embedder.py +30 -0
  17. {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/METADATA +17 -15
  18. {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/RECORD +22 -18
  19. {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/LICENSE.md +0 -0
  20. {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/WHEEL +0 -0
  21. {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/entry_points.txt +0 -0
  22. {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/top_level.txt +0 -0
@@ -14,9 +14,9 @@ from upath import UPath
14
14
  from test.integration.connectors.utils.constants import DESTINATION_TAG
15
15
  from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
16
16
  from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
17
- LanceDBS3AccessConfig,
18
- LanceDBS3ConnectionConfig,
19
- LanceDBS3Uploader,
17
+ LanceDBAwsAccessConfig,
18
+ LanceDBAwsConnectionConfig,
19
+ LanceDBAwsUploader,
20
20
  )
21
21
  from unstructured_ingest.v2.processes.connectors.lancedb.azure import (
22
22
  LanceDBAzureAccessConfig,
@@ -156,7 +156,7 @@ def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path)
156
156
 
157
157
  def _get_uploader(
158
158
  uri: str,
159
- ) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBS3Uploader, LanceDBGSPUploader]:
159
+ ) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
160
160
  target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
161
161
  if target == "az":
162
162
  azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
@@ -170,10 +170,10 @@ def _get_uploader(
170
170
  )
171
171
 
172
172
  elif target == "s3":
173
- return LanceDBS3Uploader(
173
+ return LanceDBAwsUploader(
174
174
  upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
175
- connection_config=LanceDBS3ConnectionConfig(
176
- access_config=LanceDBS3AccessConfig(
175
+ connection_config=LanceDBAwsConnectionConfig(
176
+ access_config=LanceDBAwsAccessConfig(
177
177
  aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
178
178
  aws_secret_access_key=os.getenv("S3_INGEST_TEST_SECRET_KEY"),
179
179
  ),
@@ -15,6 +15,7 @@ from pymilvus.milvus_client import IndexParams
15
15
  from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
16
16
  from test.integration.connectors.utils.docker import healthcheck_wait
17
17
  from test.integration.connectors.utils.docker_compose import docker_compose_context
18
+ from unstructured_ingest.error import DestinationConnectionError
18
19
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
19
20
  from unstructured_ingest.v2.processes.connectors.milvus import (
20
21
  CONNECTOR_TYPE,
@@ -24,9 +25,10 @@ from unstructured_ingest.v2.processes.connectors.milvus import (
24
25
  MilvusUploadStager,
25
26
  )
26
27
 
27
- DB_URI = "http://localhost:19530"
28
28
  DB_NAME = "test_database"
29
- COLLECTION_NAME = "test_collection"
29
+ EXISTENT_COLLECTION_NAME = "test_collection"
30
+ NONEXISTENT_COLLECTION_NAME = "nonexistent_collection"
31
+ DB_URI = "http://localhost:19530"
30
32
 
31
33
 
32
34
  def get_schema() -> CollectionSchema:
@@ -55,7 +57,9 @@ def get_index_params() -> IndexParams:
55
57
  return index_params
56
58
 
57
59
 
58
- @pytest.fixture
60
+ # NOTE: Precheck tests are read-only so they don't interfere with destination test,
61
+ # using scope="module" we can limit number of times the docker-compose has to be run
62
+ @pytest.fixture(scope="module")
59
63
  def collection():
60
64
  docker_client = docker.from_env()
61
65
  with docker_compose_context(docker_compose_path=env_setup_path / "milvus"):
@@ -73,10 +77,10 @@ def collection():
73
77
  schema = get_schema()
74
78
  index_params = get_index_params()
75
79
  collection_resp = milvus_client.create_collection(
76
- collection_name=COLLECTION_NAME, schema=schema, index_params=index_params
80
+ collection_name=EXISTENT_COLLECTION_NAME, schema=schema, index_params=index_params
77
81
  )
78
- print(f"Created collection {COLLECTION_NAME}: {collection_resp}")
79
- yield COLLECTION_NAME
82
+ print(f"Created collection {EXISTENT_COLLECTION_NAME}: {collection_resp}")
83
+ yield EXISTENT_COLLECTION_NAME
80
84
  finally:
81
85
  milvus_client.close()
82
86
 
@@ -139,3 +143,27 @@ async def test_milvus_destination(
139
143
  uploader.run(path=staged_filepath, file_data=file_data)
140
144
  with uploader.get_client() as client:
141
145
  validate_count(client=client, expected_count=expected_count)
146
+
147
+
148
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
149
+ def test_precheck_succeeds(collection: str):
150
+ uploader = MilvusUploader(
151
+ connection_config=MilvusConnectionConfig(uri=DB_URI),
152
+ upload_config=MilvusUploaderConfig(db_name=DB_NAME, collection_name=collection),
153
+ )
154
+ uploader.precheck()
155
+
156
+
157
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
158
+ def test_precheck_fails_on_nonexistent_collection(collection: str):
159
+ uploader = MilvusUploader(
160
+ connection_config=MilvusConnectionConfig(uri=DB_URI),
161
+ upload_config=MilvusUploaderConfig(
162
+ db_name=DB_NAME, collection_name=NONEXISTENT_COLLECTION_NAME
163
+ ),
164
+ )
165
+ with pytest.raises(
166
+ DestinationConnectionError,
167
+ match=f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist",
168
+ ):
169
+ uploader.precheck()
@@ -0,0 +1,34 @@
1
+ import pytest
2
+ from pydantic import ValidationError
3
+
4
+ from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
5
+ CloudWeaviateAccessConfig,
6
+ CloudWeaviateConnectionConfig,
7
+ )
8
+
9
+
10
+ def test_weaviate_failing_connection_config():
11
+ with pytest.raises(ValidationError):
12
+ CloudWeaviateConnectionConfig(
13
+ access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
14
+ username="username",
15
+ cluster_url="clusterurl",
16
+ )
17
+
18
+
19
+ def test_weaviate_connection_config_happy_path():
20
+ CloudWeaviateConnectionConfig(
21
+ access_config=CloudWeaviateAccessConfig(
22
+ api_key="my key",
23
+ ),
24
+ cluster_url="clusterurl",
25
+ )
26
+
27
+
28
+ def test_weaviate_connection_config_anonymous():
29
+ CloudWeaviateConnectionConfig(
30
+ access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
31
+ username="username",
32
+ anonymous=True,
33
+ cluster_url="clusterurl",
34
+ )
@@ -0,0 +1,59 @@
1
+ import json
2
+ import os
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
7
+ from test.integration.utils import requires_env
8
+ from unstructured_ingest.embed.azure_openai import (
9
+ AzureOpenAIEmbeddingConfig,
10
+ AzureOpenAIEmbeddingEncoder,
11
+ )
12
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
13
+
14
+ API_KEY = "AZURE_OPENAI_API_KEY"
15
+ ENDPOINT = "AZURE_OPENAI_ENDPOINT"
16
+
17
+
18
+ @dataclass(frozen=True)
19
+ class AzureData:
20
+ api_key: str
21
+ endpoint: str
22
+
23
+
24
+ def get_azure_data() -> AzureData:
25
+ api_key = os.getenv(API_KEY, None)
26
+ assert api_key
27
+ endpoint = os.getenv(ENDPOINT, None)
28
+ assert endpoint
29
+ return AzureData(api_key, endpoint)
30
+
31
+
32
+ @requires_env(API_KEY, ENDPOINT)
33
+ def test_azure_openai_embedder(embedder_file: Path):
34
+ azure_data = get_azure_data()
35
+ embedder_config = EmbedderConfig(
36
+ embedding_provider="azure-openai",
37
+ embedding_api_key=azure_data.api_key,
38
+ embedding_azure_endpoint=azure_data.endpoint,
39
+ )
40
+ embedder = Embedder(config=embedder_config)
41
+ results = embedder.run(elements_filepath=embedder_file)
42
+ assert results
43
+ with embedder_file.open("r") as f:
44
+ original_elements = json.load(f)
45
+ validate_embedding_output(original_elements=original_elements, output_elements=results)
46
+
47
+
48
+ @requires_env(API_KEY, ENDPOINT)
49
+ def test_raw_azure_openai_embedder(embedder_file: Path):
50
+ azure_data = get_azure_data()
51
+ embedder = AzureOpenAIEmbeddingEncoder(
52
+ config=AzureOpenAIEmbeddingConfig(
53
+ api_key=azure_data.api_key,
54
+ azure_endpoint=azure_data.endpoint,
55
+ )
56
+ )
57
+ validate_raw_embedder(
58
+ embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
59
+ )
test/unit/test_utils.py CHANGED
@@ -8,7 +8,11 @@ import pytz
8
8
 
9
9
  from unstructured_ingest.cli.utils import extract_config
10
10
  from unstructured_ingest.interfaces import BaseConfig
11
- from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
11
+ from unstructured_ingest.utils.string_and_date_utils import (
12
+ ensure_isoformat_datetime,
13
+ json_to_dict,
14
+ truncate_string_bytes,
15
+ )
12
16
 
13
17
 
14
18
  @dataclass
@@ -162,3 +166,19 @@ def test_ensure_isoformat_datetime_fails_on_string():
162
166
  def test_ensure_isoformat_datetime_fails_on_int():
163
167
  with pytest.raises(TypeError):
164
168
  ensure_isoformat_datetime(1111)
169
+
170
+
171
+ def test_truncate_string_bytes_return_truncated_string():
172
+ test_string = "abcdef안녕하세요ghijklmn방갑습니opqrstu 더 길어지면 안되는 문자열vwxyz"
173
+ max_bytes = 11
174
+ result = truncate_string_bytes(test_string, max_bytes)
175
+ assert result == "abcdef안"
176
+ assert len(result.encode("utf-8")) <= max_bytes
177
+
178
+
179
+ def test_truncate_string_bytes_return_untouched_string():
180
+ test_string = "abcdef"
181
+ max_bytes = 11
182
+ result = truncate_string_bytes(test_string, max_bytes)
183
+ assert result == "abcdef"
184
+ assert len(result.encode("utf-8")) <= max_bytes
@@ -1 +1 @@
1
- __version__ = "0.3.2" # pragma: no cover
1
+ __version__ = "0.3.4" # pragma: no cover
@@ -0,0 +1,31 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
+
4
+ from pydantic import Field
5
+
6
+ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+
9
+ if TYPE_CHECKING:
10
+ from openai import AzureOpenAI
11
+
12
+
13
+ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
14
+ api_version: str = Field(description="Azure API version", default="2024-06-01")
15
+ azure_endpoint: str
16
+ embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
17
+
18
+ @requires_dependencies(["openai"], extras="openai")
19
+ def get_client(self) -> "AzureOpenAI":
20
+ from openai import AzureOpenAI
21
+
22
+ return AzureOpenAI(
23
+ api_key=self.api_key.get_secret_value(),
24
+ api_version=self.api_version,
25
+ azure_endpoint=self.azure_endpoint,
26
+ )
27
+
28
+
29
+ @dataclass
30
+ class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
31
+ config: AzureOpenAIEmbeddingConfig
@@ -37,3 +37,13 @@ def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
37
37
  raise ValueError(f"String '{timestamp}' could not be parsed as a datetime.") from e
38
38
  else:
39
39
  raise TypeError(f"Expected input type datetime or str, but got {type(timestamp)}.")
40
+
41
+
42
+ def truncate_string_bytes(string: str, max_bytes: int, encoding: str = "utf-8") -> str:
43
+ """
44
+ Truncates a string to a specified maximum number of bytes.
45
+ """
46
+ encoded_string = str(string).encode(encoding)
47
+ if len(encoded_string) <= max_bytes:
48
+ return string
49
+ return encoded_string[:max_bytes].decode(encoding, errors="ignore")
@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
19
19
  )
20
20
  from unstructured_ingest.utils.data_prep import batch_generator
21
21
  from unstructured_ingest.utils.dep_check import requires_dependencies
22
+ from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
22
23
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
23
24
  from unstructured_ingest.v2.interfaces import (
24
25
  AccessConfig,
@@ -50,6 +51,8 @@ if TYPE_CHECKING:
50
51
 
51
52
  CONNECTOR_TYPE = "astradb"
52
53
 
54
+ MAX_CONTENT_PARAM_BYTE_SIZE = 8000
55
+
53
56
 
54
57
  class AstraDBAccessConfig(AccessConfig):
55
58
  token: str = Field(description="Astra DB Token with access to the database.")
@@ -301,7 +304,20 @@ class AstraDBUploadStager(UploadStager):
301
304
  default_factory=lambda: AstraDBUploadStagerConfig()
302
305
  )
303
306
 
307
+ def truncate_dict_elements(self, element_dict: dict) -> None:
308
+ text = element_dict.pop("text", None)
309
+ if text is not None:
310
+ element_dict["text"] = truncate_string_bytes(text, MAX_CONTENT_PARAM_BYTE_SIZE)
311
+ metadata = element_dict.get("metadata")
312
+ if metadata is not None and isinstance(metadata, dict):
313
+ text_as_html = element_dict["metadata"].pop("text_as_html", None)
314
+ if text_as_html is not None:
315
+ element_dict["metadata"]["text_as_html"] = truncate_string_bytes(
316
+ text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
317
+ )
318
+
304
319
  def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
320
+ self.truncate_dict_elements(element_dict)
305
321
  return {
306
322
  "$vector": element_dict.pop("embeddings", None),
307
323
  "content": element_dict.pop("text", None),
@@ -219,6 +219,9 @@ class CouchbaseIndexer(Indexer):
219
219
 
220
220
 
221
221
  class CouchbaseDownloaderConfig(DownloaderConfig):
222
+ collection_id: str = Field(
223
+ default="id", description="The unique key of the id field in the collection"
224
+ )
222
225
  fields: list[str] = field(default_factory=list)
223
226
 
224
227
 
@@ -250,7 +253,7 @@ class CouchbaseDownloader(Downloader):
250
253
  def generate_download_response(
251
254
  self, result: dict, bucket: str, file_data: FileData
252
255
  ) -> DownloadResponse:
253
- record_id = result["id"]
256
+ record_id = result[self.download_config.collection_id]
254
257
  filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
255
258
  filename = f"{filename_id}.txt"
256
259
  download_path = self.download_dir / Path(filename)
@@ -6,12 +6,25 @@ from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
6
6
  from .aws import lancedb_aws_destination_entry
7
7
  from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
8
8
  from .azure import lancedb_azure_destination_entry
9
+ from .cloud import CONNECTOR_TYPE as LANCEDB_CLOUD_CONNECTOR_TYPE
10
+ from .cloud import lancedb_cloud_destination_entry
9
11
  from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
10
12
  from .gcp import lancedb_gcp_destination_entry
11
13
  from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
12
14
  from .local import lancedb_local_destination_entry
13
15
 
14
- add_destination_entry(LANCEDB_S3_CONNECTOR_TYPE, lancedb_aws_destination_entry)
15
- add_destination_entry(LANCEDB_AZURE_CONNECTOR_TYPE, lancedb_azure_destination_entry)
16
- add_destination_entry(LANCEDB_GCS_CONNECTOR_TYPE, lancedb_gcp_destination_entry)
17
- add_destination_entry(LANCEDB_LOCAL_CONNECTOR_TYPE, lancedb_local_destination_entry)
16
+ add_destination_entry(
17
+ destination_type=LANCEDB_S3_CONNECTOR_TYPE, entry=lancedb_aws_destination_entry
18
+ )
19
+ add_destination_entry(
20
+ destination_type=LANCEDB_AZURE_CONNECTOR_TYPE, entry=lancedb_azure_destination_entry
21
+ )
22
+ add_destination_entry(
23
+ destination_type=LANCEDB_GCS_CONNECTOR_TYPE, entry=lancedb_gcp_destination_entry
24
+ )
25
+ add_destination_entry(
26
+ destination_type=LANCEDB_LOCAL_CONNECTOR_TYPE, entry=lancedb_local_destination_entry
27
+ )
28
+ add_destination_entry(
29
+ destination_type=LANCEDB_CLOUD_CONNECTOR_TYPE, entry=lancedb_cloud_destination_entry
30
+ )
@@ -15,28 +15,28 @@ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
15
15
  CONNECTOR_TYPE = "lancedb_aws"
16
16
 
17
17
 
18
- class LanceDBS3AccessConfig(AccessConfig):
18
+ class LanceDBAwsAccessConfig(AccessConfig):
19
19
  aws_access_key_id: str = Field(description="The AWS access key ID to use.")
20
20
  aws_secret_access_key: str = Field(description="The AWS secret access key to use.")
21
21
 
22
22
 
23
- class LanceDBS3ConnectionConfig(LanceDBRemoteConnectionConfig):
24
- access_config: Secret[LanceDBS3AccessConfig]
23
+ class LanceDBAwsConnectionConfig(LanceDBRemoteConnectionConfig):
24
+ access_config: Secret[LanceDBAwsAccessConfig]
25
25
 
26
26
  def get_storage_options(self) -> dict:
27
27
  return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
28
28
 
29
29
 
30
30
  @dataclass
31
- class LanceDBS3Uploader(LanceDBUploader):
31
+ class LanceDBAwsUploader(LanceDBUploader):
32
32
  upload_config: LanceDBUploaderConfig
33
- connection_config: LanceDBS3ConnectionConfig
33
+ connection_config: LanceDBAwsConnectionConfig
34
34
  connector_type: str = CONNECTOR_TYPE
35
35
 
36
36
 
37
37
  lancedb_aws_destination_entry = DestinationRegistryEntry(
38
- connection_config=LanceDBS3ConnectionConfig,
39
- uploader=LanceDBS3Uploader,
38
+ connection_config=LanceDBAwsConnectionConfig,
39
+ uploader=LanceDBAwsUploader,
40
40
  uploader_config=LanceDBUploaderConfig,
41
41
  upload_stager_config=LanceDBUploadStagerConfig,
42
42
  upload_stager=LanceDBUploadStager,
@@ -0,0 +1,42 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
8
+ LanceDBRemoteConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "lancedb_cloud"
16
+
17
+
18
+ class LanceDBCloudAccessConfig(AccessConfig):
19
+ api_key: str = Field(description="Api key associated with LanceDb cloud")
20
+
21
+
22
+ class LanceDBCloudConnectionConfig(LanceDBRemoteConnectionConfig):
23
+ access_config: Secret[LanceDBCloudAccessConfig]
24
+
25
+ def get_storage_options(self) -> dict:
26
+ return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
27
+
28
+
29
+ @dataclass
30
+ class LanceDBCloudUploader(LanceDBUploader):
31
+ upload_config: LanceDBUploaderConfig
32
+ connection_config: LanceDBCloudConnectionConfig
33
+ connector_type: str = CONNECTOR_TYPE
34
+
35
+
36
+ lancedb_cloud_destination_entry = DestinationRegistryEntry(
37
+ connection_config=LanceDBCloudConnectionConfig,
38
+ uploader=LanceDBCloudUploader,
39
+ uploader_config=LanceDBUploaderConfig,
40
+ upload_stager_config=LanceDBUploadStagerConfig,
41
+ upload_stager=LanceDBUploadStager,
42
+ )
@@ -8,7 +8,7 @@ import pandas as pd
8
8
  from dateutil import parser
9
9
  from pydantic import Field, Secret
10
10
 
11
- from unstructured_ingest.error import WriteError
11
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
12
12
  from unstructured_ingest.utils.data_prep import flatten_dict
13
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
14
14
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
@@ -66,7 +66,6 @@ class MilvusConnectionConfig(ConnectionConfig):
66
66
 
67
67
 
68
68
  class MilvusUploadStagerConfig(UploadStagerConfig):
69
-
70
69
  fields_to_include: Optional[list[str]] = None
71
70
  """If set - list of fields to include in the output.
72
71
  Unspecified fields are removed from the elements.
@@ -174,6 +173,14 @@ class MilvusUploader(Uploader):
174
173
  upload_config: MilvusUploaderConfig
175
174
  connector_type: str = CONNECTOR_TYPE
176
175
 
176
+ @DestinationConnectionError.wrap
177
+ def precheck(self):
178
+ with self.get_client() as client:
179
+ if not client.has_collection(self.upload_config.collection_name):
180
+ raise DestinationConnectionError(
181
+ f"Collection '{self.upload_config.collection_name}' does not exist"
182
+ )
183
+
177
184
  @contextmanager
178
185
  def get_client(self) -> Generator["MilvusClient", None, None]:
179
186
  client = self.connection_config.get_client()
@@ -218,7 +225,6 @@ class MilvusUploader(Uploader):
218
225
  f"db in collection {self.upload_config.collection_name}"
219
226
  )
220
227
  with self.get_client() as client:
221
-
222
228
  try:
223
229
  res = client.insert(collection_name=self.upload_config.collection_name, data=data)
224
230
  except MilvusException as milvus_exception:
@@ -55,10 +55,11 @@ class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
55
55
  "client_secret": access_config.client_secret is not None,
56
56
  "client_password": access_config.password is not None and self.username is not None,
57
57
  }
58
- if len(auths) == 0:
58
+ existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
59
+
60
+ if len(existing_auths) == 0:
59
61
  raise ValueError("No auth values provided and anonymous is False")
60
- if len(auths) > 1:
61
- existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
62
+ if len(existing_auths) > 1:
62
63
  raise ValueError(
63
64
  "Multiple auth values provided, only one approach can be used: {}".format(
64
65
  ", ".join(existing_auths)
@@ -16,6 +16,7 @@ class EmbedderConfig(BaseModel):
16
16
  embedding_provider: Optional[
17
17
  Literal[
18
18
  "openai",
19
+ "azure-openai",
19
20
  "huggingface",
20
21
  "aws-bedrock",
21
22
  "vertexai",
@@ -43,6 +44,14 @@ class EmbedderConfig(BaseModel):
43
44
  embedding_aws_region: Optional[str] = Field(
44
45
  default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
45
46
  )
47
+ embedding_azure_endpoint: Optional[str] = Field(
48
+ default=None,
49
+ description="Your Azure endpoint, including the resource, "
50
+ "e.g. `https://example-resource.azure.openai.com/`",
51
+ )
52
+ embedding_azure_api_version: Optional[str] = Field(
53
+ description="Azure API version", default=None
54
+ )
46
55
 
47
56
  def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
48
57
  from unstructured_ingest.embed.huggingface import (
@@ -59,6 +68,25 @@ class EmbedderConfig(BaseModel):
59
68
 
60
69
  return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
61
70
 
71
+ def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
72
+ from unstructured_ingest.embed.azure_openai import (
73
+ AzureOpenAIEmbeddingConfig,
74
+ AzureOpenAIEmbeddingEncoder,
75
+ )
76
+
77
+ config_kwargs = {
78
+ "api_key": self.embedding_api_key,
79
+ "azure_endpoint": self.embedding_azure_endpoint,
80
+ }
81
+ if api_version := self.embedding_azure_api_version:
82
+ config_kwargs["api_version"] = api_version
83
+ if model_name := self.embedding_model_name:
84
+ config_kwargs["model_name"] = model_name
85
+
86
+ return AzureOpenAIEmbeddingEncoder(
87
+ config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
88
+ )
89
+
62
90
  def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
63
91
  from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
64
92
 
@@ -146,6 +174,8 @@ class EmbedderConfig(BaseModel):
146
174
  return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
147
175
  if self.embedding_provider == "togetherai":
148
176
  return self.get_togetherai_embedder(embedding_kwargs=kwargs)
177
+ if self.embedding_provider == "azure-openai":
178
+ return self.get_azure_openai_embedder(embedding_kwargs=kwargs)
149
179
 
150
180
  raise ValueError(f"{self.embedding_provider} not a recognized encoder")
151
181
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,20 +22,20 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: python-dateutil
26
- Requires-Dist: pydantic>=2.7
27
25
  Requires-Dist: opentelemetry-sdk
28
- Requires-Dist: click
29
- Requires-Dist: tqdm
30
26
  Requires-Dist: pandas
27
+ Requires-Dist: python-dateutil
28
+ Requires-Dist: pydantic>=2.7
31
29
  Requires-Dist: dataclasses-json
30
+ Requires-Dist: tqdm
31
+ Requires-Dist: click
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
35
35
  Requires-Dist: astrapy; extra == "astradb"
36
36
  Provides-Extra: azure
37
- Requires-Dist: fsspec; extra == "azure"
38
37
  Requires-Dist: adlfs; extra == "azure"
38
+ Requires-Dist: fsspec; extra == "azure"
39
39
  Provides-Extra: azure-ai-search
40
40
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
41
41
  Provides-Extra: bedrock
@@ -44,15 +44,15 @@ Provides-Extra: biomed
44
44
  Requires-Dist: bs4; extra == "biomed"
45
45
  Requires-Dist: requests; extra == "biomed"
46
46
  Provides-Extra: box
47
- Requires-Dist: boxfs; extra == "box"
48
47
  Requires-Dist: fsspec; extra == "box"
48
+ Requires-Dist: boxfs; extra == "box"
49
49
  Provides-Extra: chroma
50
50
  Requires-Dist: chromadb; extra == "chroma"
51
51
  Provides-Extra: clarifai
52
52
  Requires-Dist: clarifai; extra == "clarifai"
53
53
  Provides-Extra: confluence
54
- Requires-Dist: atlassian-python-api; extra == "confluence"
55
54
  Requires-Dist: requests; extra == "confluence"
55
+ Requires-Dist: atlassian-python-api; extra == "confluence"
56
56
  Provides-Extra: couchbase
57
57
  Requires-Dist: couchbase; extra == "couchbase"
58
58
  Provides-Extra: csv
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
60
60
  Provides-Extra: databricks-volumes
61
61
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
62
62
  Provides-Extra: delta-table
63
- Requires-Dist: boto3; extra == "delta-table"
64
63
  Requires-Dist: deltalake; extra == "delta-table"
64
+ Requires-Dist: boto3; extra == "delta-table"
65
65
  Provides-Extra: discord
66
66
  Requires-Dist: discord-py; extra == "discord"
67
67
  Provides-Extra: doc
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
69
69
  Provides-Extra: docx
70
70
  Requires-Dist: unstructured[docx]; extra == "docx"
71
71
  Provides-Extra: dropbox
72
- Requires-Dist: dropboxdrivefs; extra == "dropbox"
73
72
  Requires-Dist: fsspec; extra == "dropbox"
73
+ Requires-Dist: dropboxdrivefs; extra == "dropbox"
74
74
  Provides-Extra: elasticsearch
75
75
  Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
76
76
  Provides-Extra: embed-huggingface
@@ -88,8 +88,8 @@ Provides-Extra: epub
88
88
  Requires-Dist: unstructured[epub]; extra == "epub"
89
89
  Provides-Extra: gcs
90
90
  Requires-Dist: bs4; extra == "gcs"
91
- Requires-Dist: gcsfs; extra == "gcs"
92
91
  Requires-Dist: fsspec; extra == "gcs"
92
+ Requires-Dist: gcsfs; extra == "gcs"
93
93
  Provides-Extra: github
94
94
  Requires-Dist: requests; extra == "github"
95
95
  Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -98,14 +98,16 @@ Requires-Dist: python-gitlab; extra == "gitlab"
98
98
  Provides-Extra: google-drive
99
99
  Requires-Dist: google-api-python-client; extra == "google-drive"
100
100
  Provides-Extra: hubspot
101
- Requires-Dist: urllib3; extra == "hubspot"
102
101
  Requires-Dist: hubspot-api-client; extra == "hubspot"
102
+ Requires-Dist: urllib3; extra == "hubspot"
103
103
  Provides-Extra: jira
104
104
  Requires-Dist: atlassian-python-api; extra == "jira"
105
105
  Provides-Extra: kafka
106
106
  Requires-Dist: confluent-kafka; extra == "kafka"
107
107
  Provides-Extra: kdbai
108
108
  Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
109
+ Provides-Extra: lancedb
110
+ Requires-Dist: lancedb; extra == "lancedb"
109
111
  Provides-Extra: md
110
112
  Requires-Dist: unstructured[md]; extra == "md"
111
113
  Provides-Extra: milvus
@@ -115,10 +117,10 @@ Requires-Dist: pymongo; extra == "mongodb"
115
117
  Provides-Extra: msg
116
118
  Requires-Dist: unstructured[msg]; extra == "msg"
117
119
  Provides-Extra: notion
118
- Requires-Dist: backoff; extra == "notion"
119
120
  Requires-Dist: htmlBuilder; extra == "notion"
120
- Requires-Dist: notion-client; extra == "notion"
121
+ Requires-Dist: backoff; extra == "notion"
121
122
  Requires-Dist: httpx; extra == "notion"
123
+ Requires-Dist: notion-client; extra == "notion"
122
124
  Provides-Extra: odt
123
125
  Requires-Dist: unstructured[odt]; extra == "odt"
124
126
  Provides-Extra: onedrive
@@ -161,8 +163,8 @@ Requires-Dist: s3fs; extra == "s3"
161
163
  Provides-Extra: salesforce
162
164
  Requires-Dist: simple-salesforce; extra == "salesforce"
163
165
  Provides-Extra: sftp
164
- Requires-Dist: paramiko; extra == "sftp"
165
166
  Requires-Dist: fsspec; extra == "sftp"
167
+ Requires-Dist: paramiko; extra == "sftp"
166
168
  Provides-Extra: sharepoint
167
169
  Requires-Dist: msal; extra == "sharepoint"
168
170
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
@@ -10,8 +10,8 @@ test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworD
10
10
  test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
11
11
  test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
12
12
  test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
13
- test/integration/connectors/test_lancedb.py,sha256=O3YF6MVBkCsCgklXCJe8Kpy8aKGfafASVH4PspmpcYs,7628
14
- test/integration/connectors/test_milvus.py,sha256=CVmYw9iEeKT_0OtShxye2E6i1LbWzzDA8JtwJRkYQlA,4763
13
+ test/integration/connectors/test_lancedb.py,sha256=8hRlqw3zYOcFCu6PPlejquSvvEM_3OEBzKTQbNm_Zmg,7635
14
+ test/integration/connectors/test_milvus.py,sha256=p4UujDr_tsRaQDmhDmDZp38t8oSFm7hrTqiq6NNuhGo,5933
15
15
  test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
16
16
  test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
17
17
  test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
@@ -35,9 +35,11 @@ test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQ
35
35
  test/integration/connectors/utils/validation.py,sha256=SwvPVuHjJxTo8xEUwnuL9FZNpu3sZZ8iouOz5xh_kB8,14272
36
36
  test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
37
  test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
38
+ test/integration/connectors/weaviate/test_cloud.py,sha256=07VxNRxWWcgTstFfpoZ1FlVnEhcBnQlo5nosWKjKz_4,979
38
39
  test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8GlQoLaCcJcFPt09NN526Ct0,4508
39
40
  test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
41
  test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
42
+ test/integration/embedders/test_azure_openai.py,sha256=6tFpKFBFRXD49imhhRzsvy3MPtuZ4L1PtnKyMVBRAqc,1808
41
43
  test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
42
44
  test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
43
45
  test/integration/embedders/test_mixedbread.py,sha256=RrLv8SByMNXsgrlh94RbaT-VyxZ4-DILO-OPpmOwvSI,1441
@@ -54,7 +56,7 @@ test/unit/test_chunking_utils.py,sha256=0iPwfnMPpyTm-yOE0BXMnEQQP4iguS6NhOqgMQU5
54
56
  test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
55
57
  test/unit/test_interfaces.py,sha256=XNj8qasc1ltaeUv-2y31rv7R9xquo0rgRrMvBZoNZLw,9623
56
58
  test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
57
- test/unit/test_utils.py,sha256=xJ9WGpHBihWpQWvIzd6z99UIdZJba8U7c31h3q6C9To,4800
59
+ test/unit/test_utils.py,sha256=Q6mp9YZPah8z3-2lreyRbmAc7m2Y_w26_N9vocSInoA,5421
58
60
  test/unit/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
61
  test/unit/embed/test_mixedbreadai.py,sha256=XFNJDP5pIgF3eQYwBiuEWmH3zZWx72Wpwyv-Q4m0DJg,1332
60
62
  test/unit/embed/test_octoai.py,sha256=Ha9EgAW64Q45hFj51tToe8RyKXWXwqAkdDqSFDMu37Q,831
@@ -81,7 +83,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
81
83
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
84
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
83
85
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
84
- unstructured_ingest/__version__.py,sha256=Js7MXQhyIj1akVjPNsLkmZxqoOHDGOr2opEPgFOSTZQ,42
86
+ unstructured_ingest/__version__.py,sha256=0rNziXrR8RxleBY3pKm77TbOCJ0CwApHiLqXBAViUAo,42
85
87
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
86
88
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
87
89
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -249,6 +251,7 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
249
251
  unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
250
252
  unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
251
253
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
254
+ unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
252
255
  unstructured_ingest/embed/bedrock.py,sha256=-PRdZsF44vwi6G4G75gdO31AJKfZWClOXkJQAk7rEO8,3096
253
256
  unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
254
257
  unstructured_ingest/embed/interfaces.py,sha256=au4Xp8ciDvo4bidlUbazFW2aC7NZW5-UDLKXBFVzAX4,2025
@@ -342,7 +345,7 @@ unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSz
342
345
  unstructured_ingest/utils/data_prep.py,sha256=IDAedOSBdgZpD9IY4tLJT-rmKGV7GHtU6KRj6VM-_tE,4666
343
346
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
344
347
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
345
- unstructured_ingest/utils/string_and_date_utils.py,sha256=LwcbLmWpwt1zEabLlyUd5kIf9oOWcZxsRzxDglLCMeU,1375
348
+ unstructured_ingest/utils/string_and_date_utils.py,sha256=kijtPlGAbH376vVjFSo5H_ZhW-FEcMC2sCNsSNwDOjo,1729
346
349
  unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
347
350
  unstructured_ingest/v2/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
348
351
  unstructured_ingest/v2/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
@@ -388,23 +391,23 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=zlgXgwReX9TBOdfTpS9hETah4
388
391
  unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
389
392
  unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
390
393
  unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
391
- unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5HGGEP9gdEAwMVK3U,6498
394
+ unstructured_ingest/v2/processes/embedder.py,sha256=xCBpaL07WnVUOUW8SHktaf1vwBGZxl3Nf8-99509ClQ,7721
392
395
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
393
396
  unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
394
397
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
395
398
  unstructured_ingest/v2/processes/connectors/__init__.py,sha256=8M3aYYNbOkS2SYG2B_HLHMgX4V69-Oz1VqpQcRQMiVg,5167
396
399
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
397
- unstructured_ingest/v2/processes/connectors/astradb.py,sha256=zsIElFNNqVCXcLqBw6C8bRoyPQDrGNPkTWeA0FYYO94,14703
400
+ unstructured_ingest/v2/processes/connectors/astradb.py,sha256=QTUQ-cv_iZi9eaXRRHQNKhtgFn-Pi20AXdSVaDFg9DM,15498
398
401
  unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
399
402
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
400
403
  unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
401
- unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=yhMDbpkZXs-Kis7tFlgjvNemU-MdWMdpCZDrpZNFaU4,12180
404
+ unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LbUJLt6fqaNYSmy9vUiovG-UOALMcvh8OD-gZAaf-f4,12333
402
405
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
403
406
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
404
407
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
405
408
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
406
409
  unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
407
- unstructured_ingest/v2/processes/connectors/milvus.py,sha256=Bzv2fa852BcM4_Pr-I_DPvLmjPoXv0Z7BeEA8qSKCDc,9725
410
+ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqAqlMcHHJSBR-GGbaZf1nvobLE,10089
408
411
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
409
412
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
410
413
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
@@ -435,9 +438,10 @@ unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfh
435
438
  unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=qprsfI8VH0mVTa1MOCpa2D4coyopinQ5ag2KXcAecXE,3296
436
439
  unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=qEv_yaG94KekFtfS06KgpTTbqeJkje0hn5uOjsMMngw,9414
437
440
  unstructured_ingest/v2/processes/connectors/kafka/local.py,sha256=vwLZjvc_C17zOqcrzic0aIoPwS98sqYiwiMknw2IcK4,2586
438
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=lHUPCOiyOGu1IME1QiyFBZaB8z8e3bP8Y8TkqKs32Qk,906
439
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=yR8V4O-oI_nUKJtHTLxhteEJpPDPn-_d2IkkXvgThJ0,1406
441
+ unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=LW37xZrn48JeHluRNulLTreUPdaF-ZU81F7MCUHcCv8,1253
442
+ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur53C4MEnpLplfO8U91KYgk--0kk5pE,1413
440
443
  unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
444
+ unstructured_ingest/v2/processes/connectors/lancedb/cloud.py,sha256=BFy0gW2OZ_qaZJM97m-tNsFaJPi9zOKrrd2y4thcNP0,1341
441
445
  unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
442
446
  unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
443
447
  unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
@@ -453,13 +457,13 @@ unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK
453
457
  unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
454
458
  unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
455
459
  unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
456
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=2g1Fm2J0ppfy2jCw4b5YtrsWrSD3VcrAaqiE7FlpIAg,6236
460
+ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
457
461
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
458
462
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
459
463
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
460
- unstructured_ingest-0.3.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
461
- unstructured_ingest-0.3.2.dist-info/METADATA,sha256=rqTWqewB8eIrgrHJ-8AsNtehy35eSHKseCsveXTwN3Y,7326
462
- unstructured_ingest-0.3.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
463
- unstructured_ingest-0.3.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
464
- unstructured_ingest-0.3.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
465
- unstructured_ingest-0.3.2.dist-info/RECORD,,
464
+ unstructured_ingest-0.3.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
465
+ unstructured_ingest-0.3.4.dist-info/METADATA,sha256=6Nj2KHvch7j5QLfahz5NcFHmmNq9vNixTfZSDUEQPjo,7393
466
+ unstructured_ingest-0.3.4.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
467
+ unstructured_ingest-0.3.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
468
+ unstructured_ingest-0.3.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
469
+ unstructured_ingest-0.3.4.dist-info/RECORD,,