unstructured-ingest 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic.

test/integration/connectors/test_lancedb.py
@@ -12,6 +12,7 @@ from lancedb import AsyncConnection
 from upath import UPath
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
     LanceDBAwsAccessConfig,
@@ -43,7 +44,6 @@ DATABASE_NAME = "database"
 TABLE_NAME = "elements"
 DIMENSION = 384
 NUMBER_EXPECTED_ROWS = 22
-NUMBER_EXPECTED_COLUMNS = 10
 S3_BUCKET = "s3://utic-ingest-test-fixtures/"
 GS_BUCKET = "gs://utic-test-ingest-fixtures-output/"
 AZURE_BUCKET = "az://utic-ingest-test-fixtures-output/"
@@ -54,9 +54,9 @@ REQUIRED_ENV_VARS = {
     "local": (),
 }
 
-
 SCHEMA = pa.schema(
     [
+        pa.field(RECORD_ID_LABEL, pa.string()),
         pa.field("vector", pa.list_(pa.float16(), DIMENSION)),
         pa.field("text", pa.string(), nullable=True),
         pa.field("type", pa.string(), nullable=True),
@@ -69,6 +69,7 @@ SCHEMA = pa.schema(
         pa.field("metadata-page_number", pa.int32(), nullable=True),
     ]
 )
+NUMBER_EXPECTED_COLUMNS = len(SCHEMA.names)
 
 
 @pytest_asyncio.fixture
@@ -116,7 +117,7 @@ async def test_lancedb_destination(
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
         connector_type=CONNECTOR_TYPE,
-        identifier="mock file data",
+        identifier="mock-file-data",
     )
     stager = LanceDBUploadStager()
     uploader = _get_uploader(uri)
@@ -129,17 +130,52 @@ async def test_lancedb_destination(
 
     await uploader.run_async(path=staged_file_path, file_data=file_data)
 
-    table = await connection.open_table(TABLE_NAME)
-    table_df: pd.DataFrame = await table.to_pandas()
+    # Test upload to empty table
+    with await connection.open_table(TABLE_NAME) as table:
+        table_df: pd.DataFrame = await table.to_pandas()
 
     assert len(table_df) == NUMBER_EXPECTED_ROWS
     assert len(table_df.columns) == NUMBER_EXPECTED_COLUMNS
 
+    assert table_df[RECORD_ID_LABEL][0] == file_data.identifier
     assert table_df["element_id"][0] == "2470d8dc42215b3d68413b55bf00fed2"
     assert table_df["type"][0] == "CompositeElement"
     assert table_df["metadata-filename"][0] == "DA-1p-with-duplicate-pages.pdf.json"
     assert table_df["metadata-text_as_html"][0] is None
 
+    # Test upload of the second file, rows should be appended
+    file_data.identifier = "mock-file-data-2"
+    staged_second_file_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=f"{upload_file.stem}-2{upload_file.suffix}",
+    )
+    await uploader.run_async(path=staged_second_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        appended_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(appended_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+    # Test re-upload of the first file, rows should be overwritten, not appended
+    await uploader.run_async(path=staged_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        overwritten_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(overwritten_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+
+class TestPrecheck:
+    @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+    @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
+    def test_succeeds(
+        self,
+        upload_file: Path,
+        connection_with_uri: tuple[AsyncConnection, str],
+        tmp_path: Path,
+    ) -> None:
+        _, uri = connection_with_uri
+        uploader = _get_uploader(uri)
+        uploader.precheck()
+
 
 def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path) -> str:
     if target == "local":
@@ -158,11 +194,12 @@ def _get_uploader(
     uri: str,
 ) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
     target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
+    upload_config = LanceDBUploaderConfig(table_name=TABLE_NAME)
     if target == "az":
         azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
         access_config_kwargs = _parse_azure_connection_string(azure_connection_string)
         return LanceDBAzureUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBAzureConnectionConfig(
                 access_config=LanceDBAzureAccessConfig(**access_config_kwargs),
                 uri=uri,
@@ -171,7 +208,7 @@ def _get_uploader(
 
     elif target == "s3":
         return LanceDBAwsUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBAwsConnectionConfig(
                 access_config=LanceDBAwsAccessConfig(
                     aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
@@ -182,7 +219,7 @@ def _get_uploader(
         )
     elif target == "gs":
         return LanceDBGSPUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBGCSConnectionConfig(
                 access_config=LanceDBGCSAccessConfig(
                     google_service_account_key=os.getenv("GCP_INGEST_SERVICE_KEY")
@@ -192,7 +229,7 @@ def _get_uploader(
         )
     else:
        return LanceDBLocalUploader(
-            upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
+            upload_config=upload_config,
             connection_config=LanceDBLocalConnectionConfig(
                 access_config=LanceDBLocalAccessConfig(),
                 uri=uri,

test/integration/connectors/test_pinecone.py
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 import re
 import time
@@ -19,6 +20,7 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connectors.pinecone import (
     CONNECTOR_TYPE,
+    MAX_QUERY_RESULTS,
     PineconeAccessConfig,
     PineconeConnectionConfig,
     PineconeUploader,
@@ -118,7 +120,10 @@ def validate_pinecone_index(
             f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
         )
         time.sleep(interval)
-    assert vector_count == expected_num_of_vectors
+    assert vector_count == expected_num_of_vectors, (
+        f"vector count from index ({vector_count}) doesn't "
+        f"match expected number: {expected_num_of_vectors}"
+    )
 
 
 @requires_env(API_KEY)
@@ -147,10 +152,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
     uploader.precheck()
 
-    if uploader.is_async():
-        await uploader.run_async(path=new_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=new_upload_file, file_data=file_data)
+    uploader.run(path=new_upload_file, file_data=file_data)
     with new_upload_file.open() as f:
         staged_content = json.load(f)
     expected_num_of_vectors = len(staged_content)
@@ -160,10 +162,59 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
     )
 
     # Rerun uploader and make sure no duplicates exist
-    if uploader.is_async():
-        await uploader.run_async(path=new_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=new_upload_file, file_data=file_data)
+    uploader.run(path=new_upload_file, file_data=file_data)
+    logger.info("validating second upload")
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+
+
+@requires_env(API_KEY)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.skip(reason="TODO: get this to work")
+async def test_pinecone_destination_large_index(
+    pinecone_index: str, upload_file: Path, temp_dir: Path
+):
+    new_file = temp_dir / "large_file.json"
+    with upload_file.open() as f:
+        upload_content = json.load(f)
+
+    min_entries = math.ceil((MAX_QUERY_RESULTS * 2) / len(upload_content))
+    new_content = (upload_content * min_entries)[: (2 * MAX_QUERY_RESULTS)]
+    print(f"Creating large index content with {len(new_content)} records")
+    with new_file.open("w") as f:
+        json.dump(new_content, f)
+
+    expected_num_of_vectors = len(new_content)
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=new_file.name, filename=new_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="pinecone_mock_id",
+    )
+    connection_config = PineconeConnectionConfig(
+        index_name=pinecone_index,
+        access_config=PineconeAccessConfig(api_key=get_api_key()),
+    )
+    stager_config = PineconeUploadStagerConfig()
+    stager = PineconeUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=new_file,
+        output_dir=temp_dir,
+        output_filename=new_file.name,
+        file_data=file_data,
+    )
+
+    upload_config = PineconeUploaderConfig()
+    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+
+    uploader.run(path=new_upload_file, file_data=file_data)
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+    # Rerun uploader and make sure no duplicates exist
+    uploader.run(path=new_upload_file, file_data=file_data)
     logger.info("validating second upload")
     validate_pinecone_index(
         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.4" # pragma: no cover
+__version__ = "0.3.6" # pragma: no cover

unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -233,8 +233,7 @@ class AzureAISearchUploader(Uploader):
            raise WriteError(
                ", ".join(
                    [
-                        f"{error.azure_ai_search_key}: "
-                        f"[{error.status_code}] {error.error_message}"
+                        f"{error.key}: " f"[{error.status_code}] {error.error_message}"
                        for error in errors
                    ],
                ),

unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py
@@ -142,8 +142,6 @@ class ElasticsearchIndexer(Indexer):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise SourceConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.index_config.index_name not in indices:
                     raise SourceConnectionError(
@@ -393,11 +391,9 @@ class ElasticsearchUploader(Uploader):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise DestinationConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.upload_config.index_name not in indices:
-                    raise SourceConnectionError(
+                    raise DestinationConnectionError(
                         "index {} not found: {}".format(
                             self.upload_config.index_name, ", ".join(indices.keys())
                         )

unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -15,6 +15,7 @@ from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.connector import ConnectionConfig
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig
@@ -84,7 +85,7 @@ class LanceDBUploadStager(UploadStager):
 
         df = pd.DataFrame(
             [
-                self._conform_element_contents(element_contents)
+                self._conform_element_contents(element_contents, file_data)
                 for element_contents in elements_contents
             ]
         )
@@ -94,9 +95,10 @@ class LanceDBUploadStager(UploadStager):
 
         return output_path
 
-    def _conform_element_contents(self, element: dict) -> dict:
+    def _conform_element_contents(self, element: dict, file_data: FileData) -> dict:
         return {
             "vector": element.pop("embeddings", None),
+            RECORD_ID_LABEL: file_data.identifier,
             **flatten_dict(element, separator="-"),
         }
 
@@ -134,6 +136,14 @@ class LanceDBUploader(Uploader):
         async with self.get_table() as table:
             schema = await table.schema()
             df = self._fit_to_schema(df, schema)
+            if RECORD_ID_LABEL not in schema.names:
+                logger.warning(
+                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
+                    " string which is required to support overwriting updates on subsequent"
+                    " uploads of the same record. New rows will be appended instead."
+                )
+            else:
+                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)
 
     def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
@@ -31,6 +31,7 @@ CONNECTOR_TYPE = "pinecone"
31
31
  MAX_PAYLOAD_SIZE = 2 * 1024 * 1024 # 2MB
32
32
  MAX_POOL_THREADS = 100
33
33
  MAX_METADATA_BYTES = 40960 # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
34
+ MAX_QUERY_RESULTS = 10000
34
35
 
35
36
 
36
37
  class PineconeAccessConfig(AccessConfig):
@@ -84,7 +85,7 @@ ALLOWED_FIELDS = (
84
85
 
85
86
  class PineconeUploadStagerConfig(UploadStagerConfig):
86
87
  metadata_fields: list[str] = Field(
87
- default=str(ALLOWED_FIELDS),
88
+ default=list(ALLOWED_FIELDS),
88
89
  description=(
89
90
  "which metadata from the source element to map to the payload metadata being sent to "
90
91
  "Pinecone."
@@ -137,7 +138,6 @@ class PineconeUploadStager(UploadStager):
137
138
  flatten_lists=True,
138
139
  remove_none=True,
139
140
  )
140
- metadata[RECORD_ID_LABEL] = file_data.identifier
141
141
  metadata_size_bytes = len(json.dumps(metadata).encode())
142
142
  if metadata_size_bytes > MAX_METADATA_BYTES:
143
143
  logger.info(
@@ -146,6 +146,8 @@ class PineconeUploadStager(UploadStager):
146
146
  )
147
147
  metadata = {}
148
148
 
149
+ metadata[RECORD_ID_LABEL] = file_data.identifier
150
+
149
151
  return {
150
152
  "id": str(uuid.uuid4()),
151
153
  "values": embeddings,
@@ -213,6 +215,18 @@ class PineconeUploader(Uploader):
213
215
  f"from pinecone index: {resp}"
214
216
  )
215
217
 
218
+ def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
219
+ while True:
220
+ query_results = index.query(**query_params)
221
+ matches = query_results.get("matches", [])
222
+ if not matches:
223
+ break
224
+ ids = [match["id"] for match in matches]
225
+ delete_params = {"ids": ids}
226
+ if namespace := self.upload_config.namespace:
227
+ delete_params["namespace"] = namespace
228
+ index.delete(**delete_params)
229
+
216
230
  def serverless_delete_by_record_id(self, file_data: FileData) -> None:
217
231
  logger.debug(
218
232
  f"deleting any content with metadata "
@@ -221,29 +235,25 @@ class PineconeUploader(Uploader):
221
235
  )
222
236
  index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
223
237
  index_stats = index.describe_index_stats()
238
+ dimension = index_stats["dimension"]
224
239
  total_vectors = index_stats["total_vector_count"]
225
240
  if total_vectors == 0:
226
241
  return
227
- dimension = index_stats["dimension"]
228
- query_params = {
229
- "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
230
- "vector": [0] * dimension,
231
- "top_k": total_vectors,
232
- }
233
- if namespace := self.upload_config.namespace:
234
- query_params["namespace"] = namespace
235
- while True:
236
- query_results = index.query(**query_params)
237
- matches = query_results.get("matches", [])
238
- if not matches:
239
- break
240
- ids = [match["id"] for match in matches]
241
- delete_params = {"ids": ids}
242
+ while total_vectors > 0:
243
+ top_k = min(total_vectors, MAX_QUERY_RESULTS)
244
+ query_params = {
245
+ "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
246
+ "vector": [0] * dimension,
247
+ "top_k": top_k,
248
+ }
242
249
  if namespace := self.upload_config.namespace:
243
- delete_params["namespace"] = namespace
244
- index.delete(**delete_params)
245
- logger.debug(
246
- f"deleted any content with metadata "
250
+ query_params["namespace"] = namespace
251
+ self.delete_by_query(index=index, query_params=query_params)
252
+ index_stats = index.describe_index_stats()
253
+ total_vectors = index_stats["total_vector_count"]
254
+
255
+ logger.info(
256
+ f"deleted {total_vectors} records with metadata "
247
257
  f"{self.upload_config.record_id_key}={file_data.identifier} "
248
258
  f"from pinecone index"
249
259
  )

unstructured_ingest/v2/processes/connectors/weaviate/__init__.py
@@ -10,8 +10,6 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
 from .embedded import weaviate_embedded_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
 from .local import weaviate_local_destination_entry
-from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
-from .weaviate import weaviate_destination_entry
 
 add_destination_entry(
     destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -22,4 +20,3 @@ add_destination_entry(
 add_destination_entry(
     destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
 )
-add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)

unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py
@@ -22,7 +22,6 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
     from weaviate.classes.init import Timeout
@@ -288,12 +287,3 @@ class WeaviateUploader(Uploader, ABC):
                     vector=vector,
                 )
         self.check_for_errors(client=weaviate_client)
-
-
-weaviate_destination_entry = DestinationRegistryEntry(
-    connection_config=WeaviateConnectionConfig,
-    uploader=WeaviateUploader,
-    uploader_config=WeaviateUploaderConfig,
-    upload_stager=WeaviateUploadStager,
-    upload_stager_config=WeaviateUploadStagerConfig,
-)

unstructured_ingest-0.3.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.4
+Version: 0.3.6
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: pandas
 Requires-Dist: python-dateutil
+Requires-Dist: tqdm
 Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
-Requires-Dist: tqdm
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: pandas
 Requires-Dist: click
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
@@ -51,8 +51,8 @@ Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -98,8 +98,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -117,26 +117,26 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
-Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -163,18 +163,18 @@ Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv

unstructured_ingest-0.3.6.dist-info/RECORD
@@ -10,11 +10,11 @@ test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworD
 test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
 test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
 test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
-test/integration/connectors/test_lancedb.py,sha256=8hRlqw3zYOcFCu6PPlejquSvvEM_3OEBzKTQbNm_Zmg,7635
+test/integration/connectors/test_lancedb.py,sha256=U2HfIrf6iJ7lYMn-vz0j-LesVyDY-jc9QrQhlJVhG9Q,9183
 test/integration/connectors/test_milvus.py,sha256=p4UujDr_tsRaQDmhDmDZp38t8oSFm7hrTqiq6NNuhGo,5933
 test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
 test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
-test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
+test/integration/connectors/test_pinecone.py,sha256=i-v5WkAI9M6SUZI7ch9qdILlRHopAdptpkSY12-BaTk,9483
 test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
 test/integration/connectors/test_s3.py,sha256=YHEYMqWTKTfR7wlL4VoxtgMs1YiYKyhLIBdG-anaQGo,6896
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -83,7 +83,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=0rNziXrR8RxleBY3pKm77TbOCJ0CwApHiLqXBAViUAo,42
+unstructured_ingest/__version__.py,sha256=J7Aic1p5b4KF_ydqV36h8cvEIhTtU-IJ72bMV9mQs8w,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -398,7 +398,7 @@ unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2j
 unstructured_ingest/v2/processes/connectors/__init__.py,sha256=8M3aYYNbOkS2SYG2B_HLHMgX4V69-Oz1VqpQcRQMiVg,5167
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=QTUQ-cv_iZi9eaXRRHQNKhtgFn-Pi20AXdSVaDFg9DM,15498
-unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
+unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=97HxxVvqf-80Bxb-AaBhFhMvoRl7cUjn4n-39vCAVG0,11962
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LbUJLt6fqaNYSmy9vUiovG-UOALMcvh8OD-gZAaf-f4,12333
@@ -411,7 +411,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqA
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=-J6QPJv_jmjln8cTUsfEEAyd_hi_fmD-uwB6C84rA4w,11930
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -423,7 +423,7 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
-unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=lzbrQ66zz3Dh_G29XFkyzQ84St8H_xfQVsYV4mTf32c,19141
+unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=sI58uypWr1mpSl4bxr46nIfypGZ4aqryCT83qqCVnSM,18921
 unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
@@ -443,7 +443,7 @@ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur
 unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
 unstructured_ingest/v2/processes/connectors/lancedb/cloud.py,sha256=BFy0gW2OZ_qaZJM97m-tNsFaJPi9zOKrrd2y4thcNP0,1341
 unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
-unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
+unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7FODnesYu8cFx1PeQJZxXij-8Dei4Kk3Bs0oxoUGBtI,5745
 unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
 unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
 unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
@@ -456,14 +456,14 @@ unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
 unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
-unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
+unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
-unstructured_ingest-0.3.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.3.4.dist-info/METADATA,sha256=6Nj2KHvch7j5QLfahz5NcFHmmNq9vNixTfZSDUEQPjo,7393
-unstructured_ingest-0.3.4.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.3.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.3.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.3.4.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
+unstructured_ingest-0.3.6.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.3.6.dist-info/METADATA,sha256=JmWEiv5oO6crJ6dRAOcBrCiJI12tOonA_arMTa5HoJY,7393
+unstructured_ingest-0.3.6.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.3.6.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.3.6.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.3.6.dist-info/RECORD,,