unstructured-ingest 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

@@ -12,6 +12,7 @@ from lancedb import AsyncConnection
 from upath import UPath
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
     LanceDBAwsAccessConfig,
@@ -43,7 +44,6 @@ DATABASE_NAME = "database"
 TABLE_NAME = "elements"
 DIMENSION = 384
 NUMBER_EXPECTED_ROWS = 22
-NUMBER_EXPECTED_COLUMNS = 10
 S3_BUCKET = "s3://utic-ingest-test-fixtures/"
 GS_BUCKET = "gs://utic-test-ingest-fixtures-output/"
 AZURE_BUCKET = "az://utic-ingest-test-fixtures-output/"
@@ -54,9 +54,9 @@ REQUIRED_ENV_VARS = {
     "local": (),
 }
 
-
 SCHEMA = pa.schema(
     [
+        pa.field(RECORD_ID_LABEL, pa.string()),
         pa.field("vector", pa.list_(pa.float16(), DIMENSION)),
         pa.field("text", pa.string(), nullable=True),
         pa.field("type", pa.string(), nullable=True),
@@ -69,6 +69,7 @@ SCHEMA = pa.schema(
         pa.field("metadata-page_number", pa.int32(), nullable=True),
     ]
 )
+NUMBER_EXPECTED_COLUMNS = len(SCHEMA.names)
 
 
 @pytest_asyncio.fixture
@@ -116,7 +117,7 @@ async def test_lancedb_destination(
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
         connector_type=CONNECTOR_TYPE,
-        identifier="mock file data",
+        identifier="mock-file-data",
     )
     stager = LanceDBUploadStager()
     uploader = _get_uploader(uri)
@@ -129,17 +130,52 @@ async def test_lancedb_destination(
 
     await uploader.run_async(path=staged_file_path, file_data=file_data)
 
-    table = await connection.open_table(TABLE_NAME)
-    table_df: pd.DataFrame = await table.to_pandas()
+    # Test upload to empty table
+    with await connection.open_table(TABLE_NAME) as table:
+        table_df: pd.DataFrame = await table.to_pandas()
 
     assert len(table_df) == NUMBER_EXPECTED_ROWS
     assert len(table_df.columns) == NUMBER_EXPECTED_COLUMNS
 
+    assert table_df[RECORD_ID_LABEL][0] == file_data.identifier
     assert table_df["element_id"][0] == "2470d8dc42215b3d68413b55bf00fed2"
     assert table_df["type"][0] == "CompositeElement"
     assert table_df["metadata-filename"][0] == "DA-1p-with-duplicate-pages.pdf.json"
     assert table_df["metadata-text_as_html"][0] is None
 
+    # Test upload of the second file, rows should be appended
+    file_data.identifier = "mock-file-data-2"
+    staged_second_file_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=f"{upload_file.stem}-2{upload_file.suffix}",
+    )
+    await uploader.run_async(path=staged_second_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        appended_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(appended_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+    # Test re-upload of the first file, rows should be overwritten, not appended
+    await uploader.run_async(path=staged_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        overwritten_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(overwritten_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+
+class TestPrecheck:
+    @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+    @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
+    def test_succeeds(
+        self,
+        upload_file: Path,
+        connection_with_uri: tuple[AsyncConnection, str],
+        tmp_path: Path,
+    ) -> None:
+        _, uri = connection_with_uri
+        uploader = _get_uploader(uri)
+        uploader.precheck()
+
 
 def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path) -> str:
     if target == "local":
@@ -158,11 +194,12 @@ def _get_uploader(
     uri: str,
 ) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
     target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
+    upload_config = LanceDBUploaderConfig(table_name=TABLE_NAME)
     if target == "az":
         azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
         access_config_kwargs = _parse_azure_connection_string(azure_connection_string)
         return LanceDBAzureUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBAzureConnectionConfig(
                 access_config=LanceDBAzureAccessConfig(**access_config_kwargs),
                 uri=uri,
@@ -171,7 +208,7 @@
 
     elif target == "s3":
         return LanceDBAwsUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBAwsConnectionConfig(
                 access_config=LanceDBAwsAccessConfig(
                     aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
@@ -182,7 +219,7 @@
         )
     elif target == "gs":
         return LanceDBGSPUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBGCSConnectionConfig(
                 access_config=LanceDBGCSAccessConfig(
                     google_service_account_key=os.getenv("GCP_INGEST_SERVICE_KEY")
@@ -192,7 +229,7 @@
         )
     else:
         return LanceDBLocalUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBLocalConnectionConfig(
                 access_config=LanceDBLocalAccessConfig(),
                 uri=uri,
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 import re
 import time
@@ -19,6 +20,7 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connectors.pinecone import (
     CONNECTOR_TYPE,
+    MAX_QUERY_RESULTS,
     PineconeAccessConfig,
     PineconeConnectionConfig,
     PineconeUploader,
@@ -118,7 +120,10 @@ def validate_pinecone_index(
             f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
         )
         time.sleep(interval)
-    assert vector_count == expected_num_of_vectors
+    assert vector_count == expected_num_of_vectors, (
+        f"vector count from index ({vector_count}) doesn't "
+        f"match expected number: {expected_num_of_vectors}"
+    )
 
 
 @requires_env(API_KEY)
@@ -147,10 +152,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
     uploader.precheck()
 
-    if uploader.is_async():
-        await uploader.run_async(path=new_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=new_upload_file, file_data=file_data)
+    uploader.run(path=new_upload_file, file_data=file_data)
     with new_upload_file.open() as f:
         staged_content = json.load(f)
     expected_num_of_vectors = len(staged_content)
@@ -160,10 +162,59 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     )
 
     # Rerun uploader and make sure no duplicates exist
-    if uploader.is_async():
-        await uploader.run_async(path=new_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=new_upload_file, file_data=file_data)
+    uploader.run(path=new_upload_file, file_data=file_data)
+    logger.info("validating second upload")
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+
+
+@requires_env(API_KEY)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.skip(reason="TODO: get this to work")
+async def test_pinecone_destination_large_index(
+    pinecone_index: str, upload_file: Path, temp_dir: Path
+):
+    new_file = temp_dir / "large_file.json"
+    with upload_file.open() as f:
+        upload_content = json.load(f)
+
+    min_entries = math.ceil((MAX_QUERY_RESULTS * 2) / len(upload_content))
+    new_content = (upload_content * min_entries)[: (2 * MAX_QUERY_RESULTS)]
+    print(f"Creating large index content with {len(new_content)} records")
+    with new_file.open("w") as f:
+        json.dump(new_content, f)
+
+    expected_num_of_vectors = len(new_content)
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=new_file.name, filename=new_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="pinecone_mock_id",
+    )
+    connection_config = PineconeConnectionConfig(
+        index_name=pinecone_index,
+        access_config=PineconeAccessConfig(api_key=get_api_key()),
+    )
+    stager_config = PineconeUploadStagerConfig()
+    stager = PineconeUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=new_file,
+        output_dir=temp_dir,
+        output_filename=new_file.name,
+        file_data=file_data,
+    )
+
+    upload_config = PineconeUploaderConfig()
+    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+
+    uploader.run(path=new_upload_file, file_data=file_data)
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+    # Rerun uploader and make sure no duplicates exist
+    uploader.run(path=new_upload_file, file_data=file_data)
     logger.info("validating second upload")
     validate_pinecone_index(
         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
@@ -0,0 +1,59 @@
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from test.integration.utils import requires_env
+from unstructured_ingest.embed.azure_openai import (
+    AzureOpenAIEmbeddingConfig,
+    AzureOpenAIEmbeddingEncoder,
+)
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+API_KEY = "AZURE_OPENAI_API_KEY"
+ENDPOINT = "AZURE_OPENAI_ENDPOINT"
+
+
+@dataclass(frozen=True)
+class AzureData:
+    api_key: str
+    endpoint: str
+
+
+def get_azure_data() -> AzureData:
+    api_key = os.getenv(API_KEY, None)
+    assert api_key
+    endpoint = os.getenv(ENDPOINT, None)
+    assert endpoint
+    return AzureData(api_key, endpoint)
+
+
+@requires_env(API_KEY, ENDPOINT)
+def test_azure_openai_embedder(embedder_file: Path):
+    azure_data = get_azure_data()
+    embedder_config = EmbedderConfig(
+        embedding_provider="azure-openai",
+        embedding_api_key=azure_data.api_key,
+        embedding_azure_endpoint=azure_data.endpoint,
+    )
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+@requires_env(API_KEY, ENDPOINT)
+def test_raw_azure_openai_embedder(embedder_file: Path):
+    azure_data = get_azure_data()
+    embedder = AzureOpenAIEmbeddingEncoder(
+        config=AzureOpenAIEmbeddingConfig(
+            api_key=azure_data.api_key,
+            azure_endpoint=azure_data.endpoint,
+        )
+    )
+    validate_raw_embedder(
+        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
+    )
@@ -1 +1 @@
-__version__ = "0.3.3" # pragma: no cover
+__version__ = "0.3.5" # pragma: no cover
@@ -0,0 +1,31 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from pydantic import Field
+
+from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from openai import AzureOpenAI
+
+
+class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
+    api_version: str = Field(description="Azure API version", default="2024-06-01")
+    azure_endpoint: str
+    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
+
+    @requires_dependencies(["openai"], extras="openai")
+    def get_client(self) -> "AzureOpenAI":
+        from openai import AzureOpenAI
+
+        return AzureOpenAI(
+            api_key=self.api_key.get_secret_value(),
+            api_version=self.api_version,
+            azure_endpoint=self.azure_endpoint,
+        )
+
+
+@dataclass
+class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
+    config: AzureOpenAIEmbeddingConfig
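
The new encoder above subclasses the existing OpenAI embedder and only overrides client construction. A minimal sketch of using it directly, with placeholder credentials; the field names and the `get_client()` method come from the diff above, everything else is illustrative.

```python
# Sketch only: endpoint and key are placeholders; api_version defaults to "2024-06-01" as shown above.
from unstructured_ingest.embed.azure_openai import (
    AzureOpenAIEmbeddingConfig,
    AzureOpenAIEmbeddingEncoder,
)

encoder = AzureOpenAIEmbeddingEncoder(
    config=AzureOpenAIEmbeddingConfig(
        api_key="<azure-openai-api-key>",
        azure_endpoint="https://example-resource.azure.openai.com/",
    )
)
client = encoder.config.get_client()  # returns an openai.AzureOpenAI client, per the diff
```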
@@ -219,6 +219,9 @@ class CouchbaseIndexer(Indexer):
 
 
 class CouchbaseDownloaderConfig(DownloaderConfig):
+    collection_id: str = Field(
+        default="id", description="The unique key of the id field in the collection"
+    )
     fields: list[str] = field(default_factory=list)
 
 
@@ -250,7 +253,7 @@ class CouchbaseDownloader(Downloader):
     def generate_download_response(
         self, result: dict, bucket: str, file_data: FileData
     ) -> DownloadResponse:
-        record_id = result["id"]
+        record_id = result[self.download_config.collection_id]
         filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
         filename = f"{filename_id}.txt"
         download_path = self.download_dir / Path(filename)
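
The new `collection_id` option lets the Couchbase downloader read the record id from a field other than the default `"id"`. A hedged sketch of setting it; only the `collection_id` and `fields` names come from the diff, the values are illustrative.

```python
# Illustrative values; only the field names are taken from the diff above.
from unstructured_ingest.v2.processes.connectors.couchbase import CouchbaseDownloaderConfig

download_config = CouchbaseDownloaderConfig(
    collection_id="doc_id",      # hypothetical key holding each record's id
    fields=["title", "body"],    # hypothetical fields to include in the download
)
```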
@@ -142,8 +142,6 @@ class ElasticsearchIndexer(Indexer):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise SourceConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.index_config.index_name not in indices:
                     raise SourceConnectionError(
@@ -393,11 +391,9 @@ class ElasticsearchUploader(Uploader):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise DestinationConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.upload_config.index_name not in indices:
-                    raise SourceConnectionError(
+                    raise DestinationConnectionError(
                         "index {} not found: {}".format(
                             self.upload_config.index_name, ", ".join(indices.keys())
                         )
@@ -15,6 +15,7 @@ from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.connector import ConnectionConfig
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig
@@ -84,7 +85,7 @@ class LanceDBUploadStager(UploadStager):
 
         df = pd.DataFrame(
             [
-                self._conform_element_contents(element_contents)
+                self._conform_element_contents(element_contents, file_data)
                 for element_contents in elements_contents
             ]
         )
@@ -94,9 +95,10 @@ class LanceDBUploadStager(UploadStager):
 
         return output_path
 
-    def _conform_element_contents(self, element: dict) -> dict:
+    def _conform_element_contents(self, element: dict, file_data: FileData) -> dict:
         return {
             "vector": element.pop("embeddings", None),
+            RECORD_ID_LABEL: file_data.identifier,
             **flatten_dict(element, separator="-"),
         }
 
@@ -134,6 +136,14 @@ class LanceDBUploader(Uploader):
         async with self.get_table() as table:
             schema = await table.schema()
             df = self._fit_to_schema(df, schema)
+            if RECORD_ID_LABEL not in schema.names:
+                logger.warning(
+                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
+                    " string which is required to support overwriting updates on subsequent"
+                    " uploads of the same record. New rows will be appended instead."
+                )
+            else:
+                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)
 
     def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
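
The uploader change above implements overwrite-on-reupload: when the destination table has a record-id column, rows from a previous upload of the same record are deleted before the new rows are appended. A minimal standalone sketch of that pattern; the helper name and the literal value of RECORD_ID_LABEL are assumptions, while the table calls mirror the diff.

```python
# Assumed helper, not part of the package; mirrors the delete-then-add calls shown above.
import pandas as pd

RECORD_ID_LABEL = "record_id"  # assumption: the real constant lives in unstructured_ingest.v2.constants


async def upsert_by_record_id(table, df: pd.DataFrame, record_id: str) -> None:
    schema = await table.schema()
    if RECORD_ID_LABEL in schema.names:
        # drop rows written by an earlier upload of the same record
        await table.delete(f'{RECORD_ID_LABEL} = "{record_id}"')
    # append the freshly staged rows
    await table.add(data=df)
```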
@@ -31,6 +31,7 @@ CONNECTOR_TYPE = "pinecone"
31
31
  MAX_PAYLOAD_SIZE = 2 * 1024 * 1024 # 2MB
32
32
  MAX_POOL_THREADS = 100
33
33
  MAX_METADATA_BYTES = 40960 # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
34
+ MAX_QUERY_RESULTS = 10000
34
35
 
35
36
 
36
37
  class PineconeAccessConfig(AccessConfig):
@@ -84,7 +85,7 @@ ALLOWED_FIELDS = (
84
85
 
85
86
  class PineconeUploadStagerConfig(UploadStagerConfig):
86
87
  metadata_fields: list[str] = Field(
87
- default=str(ALLOWED_FIELDS),
88
+ default=list(ALLOWED_FIELDS),
88
89
  description=(
89
90
  "which metadata from the source element to map to the payload metadata being sent to "
90
91
  "Pinecone."
@@ -137,7 +138,6 @@ class PineconeUploadStager(UploadStager):
137
138
  flatten_lists=True,
138
139
  remove_none=True,
139
140
  )
140
- metadata[RECORD_ID_LABEL] = file_data.identifier
141
141
  metadata_size_bytes = len(json.dumps(metadata).encode())
142
142
  if metadata_size_bytes > MAX_METADATA_BYTES:
143
143
  logger.info(
@@ -146,6 +146,8 @@ class PineconeUploadStager(UploadStager):
146
146
  )
147
147
  metadata = {}
148
148
 
149
+ metadata[RECORD_ID_LABEL] = file_data.identifier
150
+
149
151
  return {
150
152
  "id": str(uuid.uuid4()),
151
153
  "values": embeddings,
@@ -213,6 +215,18 @@ class PineconeUploader(Uploader):
213
215
  f"from pinecone index: {resp}"
214
216
  )
215
217
 
218
+ def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
219
+ while True:
220
+ query_results = index.query(**query_params)
221
+ matches = query_results.get("matches", [])
222
+ if not matches:
223
+ break
224
+ ids = [match["id"] for match in matches]
225
+ delete_params = {"ids": ids}
226
+ if namespace := self.upload_config.namespace:
227
+ delete_params["namespace"] = namespace
228
+ index.delete(**delete_params)
229
+
216
230
  def serverless_delete_by_record_id(self, file_data: FileData) -> None:
217
231
  logger.debug(
218
232
  f"deleting any content with metadata "
@@ -221,29 +235,25 @@ class PineconeUploader(Uploader):
221
235
  )
222
236
  index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
223
237
  index_stats = index.describe_index_stats()
238
+ dimension = index_stats["dimension"]
224
239
  total_vectors = index_stats["total_vector_count"]
225
240
  if total_vectors == 0:
226
241
  return
227
- dimension = index_stats["dimension"]
228
- query_params = {
229
- "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
230
- "vector": [0] * dimension,
231
- "top_k": total_vectors,
232
- }
233
- if namespace := self.upload_config.namespace:
234
- query_params["namespace"] = namespace
235
- while True:
236
- query_results = index.query(**query_params)
237
- matches = query_results.get("matches", [])
238
- if not matches:
239
- break
240
- ids = [match["id"] for match in matches]
241
- delete_params = {"ids": ids}
242
+ while total_vectors > 0:
243
+ top_k = min(total_vectors, MAX_QUERY_RESULTS)
244
+ query_params = {
245
+ "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
246
+ "vector": [0] * dimension,
247
+ "top_k": top_k,
248
+ }
242
249
  if namespace := self.upload_config.namespace:
243
- delete_params["namespace"] = namespace
244
- index.delete(**delete_params)
245
- logger.debug(
246
- f"deleted any content with metadata "
250
+ query_params["namespace"] = namespace
251
+ self.delete_by_query(index=index, query_params=query_params)
252
+ index_stats = index.describe_index_stats()
253
+ total_vectors = index_stats["total_vector_count"]
254
+
255
+ logger.info(
256
+ f"deleted {total_vectors} records with metadata "
247
257
  f"{self.upload_config.record_id_key}={file_data.identifier} "
248
258
  f"from pinecone index"
249
259
  )
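
The refactor above splits deletion into a paginated loop: `serverless_delete_by_record_id` now caps each metadata-filtered query at MAX_QUERY_RESULTS, delegates the id deletion to `delete_by_query`, and re-reads the index stats until nothing matching the record id remains. A rough standalone sketch of that pattern; the helper itself is hypothetical, while the Pinecone calls are the ones used in the diff.

```python
# Hypothetical helper illustrating the paginated delete pattern shown above.
MAX_QUERY_RESULTS = 10000  # value added in this release


def delete_record(index, record_id_key: str, record_id: str, dimension: int) -> None:
    total_vectors = index.describe_index_stats()["total_vector_count"]
    while total_vectors > 0:
        results = index.query(
            filter={record_id_key: {"$eq": record_id}},
            vector=[0] * dimension,  # dummy query vector; only the metadata filter matters
            top_k=min(total_vectors, MAX_QUERY_RESULTS),
        )
        matches = results.get("matches", [])
        if not matches:
            break  # nothing left for this record id
        index.delete(ids=[match["id"] for match in matches])
        total_vectors = index.describe_index_stats()["total_vector_count"]
```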
@@ -10,8 +10,6 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
 from .embedded import weaviate_embedded_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
 from .local import weaviate_local_destination_entry
-from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
-from .weaviate import weaviate_destination_entry
 
 add_destination_entry(
     destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -22,4 +20,3 @@ add_destination_entry(
 add_destination_entry(
     destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
 )
-add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
@@ -22,7 +22,6 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
     from weaviate.classes.init import Timeout
@@ -288,12 +287,3 @@ class WeaviateUploader(Uploader, ABC):
                 vector=vector,
             )
             self.check_for_errors(client=weaviate_client)
-
-
-weaviate_destination_entry = DestinationRegistryEntry(
-    connection_config=WeaviateConnectionConfig,
-    uploader=WeaviateUploader,
-    uploader_config=WeaviateUploaderConfig,
-    upload_stager=WeaviateUploadStager,
-    upload_stager_config=WeaviateUploadStagerConfig,
-)
@@ -16,6 +16,7 @@ class EmbedderConfig(BaseModel):
     embedding_provider: Optional[
         Literal[
             "openai",
+            "azure-openai",
             "huggingface",
             "aws-bedrock",
             "vertexai",
@@ -43,6 +44,14 @@ class EmbedderConfig(BaseModel):
     embedding_aws_region: Optional[str] = Field(
         default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
     )
+    embedding_azure_endpoint: Optional[str] = Field(
+        default=None,
+        description="Your Azure endpoint, including the resource, "
+        "e.g. `https://example-resource.azure.openai.com/`",
+    )
+    embedding_azure_api_version: Optional[str] = Field(
+        description="Azure API version", default=None
+    )
 
     def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
         from unstructured_ingest.embed.huggingface import (
@@ -59,6 +68,25 @@ class EmbedderConfig(BaseModel):
 
         return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
 
+    def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured_ingest.embed.azure_openai import (
+            AzureOpenAIEmbeddingConfig,
+            AzureOpenAIEmbeddingEncoder,
+        )
+
+        config_kwargs = {
+            "api_key": self.embedding_api_key,
+            "azure_endpoint": self.embedding_azure_endpoint,
+        }
+        if api_version := self.embedding_azure_api_version:
+            config_kwargs["api_version"] = api_version
+        if model_name := self.embedding_model_name:
+            config_kwargs["model_name"] = model_name
+
+        return AzureOpenAIEmbeddingEncoder(
+            config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
+        )
+
     def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
         from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
 
@@ -146,6 +174,8 @@ class EmbedderConfig(BaseModel):
             return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
         if self.embedding_provider == "togetherai":
             return self.get_togetherai_embedder(embedding_kwargs=kwargs)
+        if self.embedding_provider == "azure-openai":
+            return self.get_azure_openai_embedder(embedding_kwargs=kwargs)
 
         raise ValueError(f"{self.embedding_provider} not a recognized encoder")
 
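
With the wiring above, the new provider is selected through EmbedderConfig like the other encoders. A hedged usage sketch mirroring the new integration test; the endpoint and key are placeholders.

```python
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

config = EmbedderConfig(
    embedding_provider="azure-openai",
    embedding_api_key="<azure-openai-api-key>",
    embedding_azure_endpoint="https://example-resource.azure.openai.com/",
    # embedding_azure_api_version is optional; when unset, the encoder's own
    # default ("2024-06-01") from AzureOpenAIEmbeddingConfig applies.
)
embedder = Embedder(config=config)
# results = embedder.run(elements_filepath="elements.json")  # as in the new test
```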
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.3
+Version: 0.3.5
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: dataclasses-json
-Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
+Requires-Dist: dataclasses-json
 Requires-Dist: tqdm
-Requires-Dist: python-dateutil
-Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: pydantic>=2.7
+Requires-Dist: click
+Requires-Dist: python-dateutil
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -51,8 +51,8 @@ Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: atlassian-python-api; extra == "confluence"
 Requires-Dist: requests; extra == "confluence"
+Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: boto3; extra == "delta-table"
 Requires-Dist: deltalake; extra == "delta-table"
+Requires-Dist: boto3; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == "discord"
 Provides-Extra: doc
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -88,8 +88,8 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -117,26 +117,26 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: msal; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -158,16 +158,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
 Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
@@ -10,11 +10,11 @@ test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworD
 test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
 test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
 test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
-test/integration/connectors/test_lancedb.py,sha256=8hRlqw3zYOcFCu6PPlejquSvvEM_3OEBzKTQbNm_Zmg,7635
+test/integration/connectors/test_lancedb.py,sha256=U2HfIrf6iJ7lYMn-vz0j-LesVyDY-jc9QrQhlJVhG9Q,9183
 test/integration/connectors/test_milvus.py,sha256=p4UujDr_tsRaQDmhDmDZp38t8oSFm7hrTqiq6NNuhGo,5933
 test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
 test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
-test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
+test/integration/connectors/test_pinecone.py,sha256=i-v5WkAI9M6SUZI7ch9qdILlRHopAdptpkSY12-BaTk,9483
 test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
 test/integration/connectors/test_s3.py,sha256=YHEYMqWTKTfR7wlL4VoxtgMs1YiYKyhLIBdG-anaQGo,6896
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -39,6 +39,7 @@ test/integration/connectors/weaviate/test_cloud.py,sha256=07VxNRxWWcgTstFfpoZ1Fl
 test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8GlQoLaCcJcFPt09NN526Ct0,4508
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
+test/integration/embedders/test_azure_openai.py,sha256=6tFpKFBFRXD49imhhRzsvy3MPtuZ4L1PtnKyMVBRAqc,1808
 test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
 test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
 test/integration/embedders/test_mixedbread.py,sha256=RrLv8SByMNXsgrlh94RbaT-VyxZ4-DILO-OPpmOwvSI,1441
@@ -82,7 +83,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=p5uBTX3-kWJF_Qc2XUwgA0BcGSwYkkJo-kqLi89Vqo4,42
+unstructured_ingest/__version__.py,sha256=70Yw9e-njzEFR9kr-pzp5J1EslWrJuu4TCVbxa-fdmM,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -250,6 +251,7 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
 unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
 unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
 unstructured_ingest/embed/bedrock.py,sha256=-PRdZsF44vwi6G4G75gdO31AJKfZWClOXkJQAk7rEO8,3096
 unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
 unstructured_ingest/embed/interfaces.py,sha256=au4Xp8ciDvo4bidlUbazFW2aC7NZW5-UDLKXBFVzAX4,2025
@@ -389,7 +391,7 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=zlgXgwReX9TBOdfTpS9hETah4
 unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
 unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
 unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
-unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5HGGEP9gdEAwMVK3U,6498
+unstructured_ingest/v2/processes/embedder.py,sha256=xCBpaL07WnVUOUW8SHktaf1vwBGZxl3Nf8-99509ClQ,7721
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
@@ -399,7 +401,7 @@ unstructured_ingest/v2/processes/connectors/astradb.py,sha256=QTUQ-cv_iZi9eaXRRH
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
-unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=yhMDbpkZXs-Kis7tFlgjvNemU-MdWMdpCZDrpZNFaU4,12180
+unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LbUJLt6fqaNYSmy9vUiovG-UOALMcvh8OD-gZAaf-f4,12333
 unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
 unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
@@ -409,7 +411,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqA
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=-J6QPJv_jmjln8cTUsfEEAyd_hi_fmD-uwB6C84rA4w,11930
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -421,7 +423,7 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
-unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=lzbrQ66zz3Dh_G29XFkyzQ84St8H_xfQVsYV4mTf32c,19141
+unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=sI58uypWr1mpSl4bxr46nIfypGZ4aqryCT83qqCVnSM,18921
 unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
@@ -441,7 +443,7 @@ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur
 unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
 unstructured_ingest/v2/processes/connectors/lancedb/cloud.py,sha256=BFy0gW2OZ_qaZJM97m-tNsFaJPi9zOKrrd2y4thcNP0,1341
 unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
-unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
+unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7FODnesYu8cFx1PeQJZxXij-8Dei4Kk3Bs0oxoUGBtI,5745
 unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
 unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
 unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
@@ -454,14 +456,14 @@ unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
 unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
-unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
+unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
-unstructured_ingest-0.3.3.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.3.3.dist-info/METADATA,sha256=AEumzINrBNXXeBEBQiIB8309_9OkIWhLeo7Giqzl1ew,7393
-unstructured_ingest-0.3.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.3.3.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.3.3.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.3.3.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
+unstructured_ingest-0.3.5.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.3.5.dist-info/METADATA,sha256=6lMRXK_RZho8cMblH299fqDfZix6a9843VGiPvhnDV8,7393
+unstructured_ingest-0.3.5.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.3.5.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.3.5.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.3.5.dist-info/RECORD,,