unstructured-ingest 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

test/integration/connectors/test_pinecone.py
@@ -351,3 +351,37 @@ def test_pinecone_stager(
         stager=stager,
         tmp_dir=tmp_path,
     )
+
+
+@requires_env(API_KEY)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
+def test_pinecone_create_destination(pinecone_index):
+    uploader = PineconeUploader(
+        connection_config=PineconeConnectionConfig(
+            access_config=PineconeAccessConfig(api_key=get_api_key())
+        ),
+        upload_config=PineconeUploaderConfig(),
+    )
+
+    random_id = str(uuid4()).split("-")[0]
+
+    index_name = f"test-create-destination-{random_id}"
+
+    assert not uploader.index_exists(index_name=index_name)
+
+    try:
+        uploader.create_destination(destination_name=index_name, vector_length=1536)
+    except Exception as e:
+        error_body = getattr(e, "body", None)
+        raise pytest.fail(f"failed to create destination: {e} {error_body}")
+
+    assert uploader.index_exists(index_name=index_name), "destination was not created successfully"
+
+    try:
+        pc = uploader.connection_config.get_client()
+        logger.info(f"deleting index for test create destination: {index_name}")
+        pc.delete_index(name=index_name)
+    except Exception as e:
+        raise pytest.fail(f"failed to cleanup / delete the destination: {e}")
+
+    assert not uploader.index_exists(index_name=index_name), "cleanup failed"

test/integration/connectors/weaviate/test_local.py
@@ -78,7 +78,6 @@ def run_uploader_and_validate(
     validate_count(expected_count=expected_count)


-@pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
     file_data = FileData(
@@ -142,11 +141,12 @@ def test_weaviate_local_create_destination(weaviate_instance):
         upload_config=LocalWeaviateUploaderConfig(),
         connection_config=LocalWeaviateConnectionConfig(),
     )
-    collection_name = "system_created"
+    collection_name = "system_created-123"
+    formatted_collection_name = "System_created_123"
     created = uploader.create_destination(destination_name=collection_name)
     assert created
     with uploader.connection_config.get_client() as weaviate_client:
-        assert weaviate_client.collections.exists(name=collection_name)
+        assert weaviate_client.collections.exists(name=formatted_collection_name)

     created = uploader.create_destination(destination_name=collection_name)
     assert not created

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.4" # pragma: no cover
+__version__ = "0.5.6" # pragma: no cover

unstructured_ingest/v2/interfaces/process.py
@@ -8,7 +8,7 @@ class BaseProcess(ABC):
     def is_async(self) -> bool:
         return False

-    def init(self, *kwargs: Any) -> None:
+    def init(self, **kwargs: Any) -> None:
        pass

    def precheck(self) -> None:
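
The one-character fix above (mirrored in embedder.py and weaviate.py below) matters because `*kwargs` collects positional arguments into a tuple, so any keyword argument, such as the `vector_length` the pipeline now forwards, raised a TypeError. A minimal standalone demonstration, not library code:

from typing import Any

class Before:
    def init(self, *kwargs: Any) -> None:  # typo: collects positionals only
        pass

class After:
    def init(self, **kwargs: Any) -> None:  # accepts arbitrary keywords
        pass

After().init(vector_length=1536)  # fine
try:
    Before().init(vector_length=1536)
except TypeError as e:
    print(e)  # init() got an unexpected keyword argument 'vector_length'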

unstructured_ingest/v2/interfaces/uploader.py
@@ -1,7 +1,7 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional, TypeVar
+from typing import Any, TypeVar

 from pydantic import BaseModel

@@ -61,6 +61,6 @@ class Uploader(BaseProcess, BaseConnector, ABC):
 @dataclass
 class VectorDBUploader(Uploader, ABC):
     def create_destination(
-        self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
+        self, vector_length: int, destination_name: str = "elements", **kwargs: Any
     ) -> bool:
         return False

unstructured_ingest/v2/pipeline/pipeline.py
@@ -126,14 +126,32 @@ class Pipeline:
             for kk, vv in v.items():
                 logger.error(f"{k}: [{kk}] {vv}")

+    def _run_initialization(self):
+        failures = {}
+        init_kwargs = {}
+        for step in self._get_ordered_steps():
+            try:
+                step.process.init(**init_kwargs)
+                step.process.precheck()
+                # Make sure embedder dimensions available for downstream steps
+                if isinstance(step.process, Embedder):
+                    embed_dimensions = step.process.config.get_embedder().dimension
+                    init_kwargs["vector_length"] = embed_dimensions
+
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step initialization failure: {k}: {v}")
+            raise PipelineError("Initialization failed")
+
     def run(self):
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
         try:
             with otel_handler.get_tracer().start_as_current_span(
                 "ingest process", record_exception=True
             ):
-                self._run_inits()
-                self._run_prechecks()
+                self._run_initialization()
                 self._run()
         finally:
             self.log_statuses()
@@ -154,43 +172,20 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None

-    def _get_all_steps(self) -> list[PipelineStep]:
-        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+    def _get_ordered_steps(self) -> list[PipelineStep]:
+        steps = [self.indexer_step, self.downloader_step]
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        steps.append(self.partitioner_step)
         if self.chunker_step:
             steps.append(self.chunker_step)
         if self.embedder_step:
             steps.append(self.embedder_step)
-        if self.uncompress_step:
-            steps.append(self.uncompress_step)
         if self.stager_step:
             steps.append(self.stager_step)
+        steps.append(self.uploader_step)
         return steps

-    def _run_inits(self):
-        failures = {}
-
-        for step in self._get_all_steps():
-            try:
-                step.process.init()
-            except Exception as e:
-                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
-        if failures:
-            for k, v in failures.items():
-                logger.error(f"Step init failure: {k}: {v}")
-            raise PipelineError("Init failed")
-
-    def _run_prechecks(self):
-        failures = {}
-        for step in self._get_all_steps():
-            try:
-                step.process.precheck()
-            except Exception as e:
-                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
-        if failures:
-            for k, v in failures.items():
-                logger.error(f"Step precheck failure: {k}: {v}")
-            raise PipelineError("Precheck failed")
-
     def apply_filter(self, records: list[dict]) -> list[dict]:
         if not self.filter_step:
             return records
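
The pipeline now runs init and precheck in a single ordered pass and threads the embedder's output dimension into the init kwargs of every later step, which is how a vector-DB uploader can size its index. A minimal sketch of that contract with stand-in classes, not the library's own:

from typing import Any

class StubEmbedder:
    dimension = 1536  # stands in for config.get_embedder().dimension

class StubVectorDBUploader:
    def init(self, vector_length: int = 0, **kwargs: Any) -> None:
        # a real VectorDBUploader would create its destination here
        print(f"would create a destination with dimension {vector_length}")

init_kwargs: dict[str, Any] = {}
init_kwargs["vector_length"] = StubEmbedder.dimension  # what _run_initialization does
StubVectorDBUploader().init(**init_kwargs)  # -> would create a destination with dimension 1536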

unstructured_ingest/v2/processes/connectors/astradb.py
@@ -42,6 +42,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
+from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements

 if TYPE_CHECKING:
     from astrapy import AsyncCollection as AstraDBAsyncCollection
@@ -318,6 +319,7 @@ class AstraDBUploadStager(UploadStager):
             element_dict["metadata"]["text_as_html"] = truncate_string_bytes(
                 text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
             )
+        metadata["original_elements"] = format_and_truncate_orig_elements(element_dict)

     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         self.truncate_dict_elements(element_dict)

unstructured_ingest/v2/processes/connectors/neo4j.py
@@ -14,7 +14,6 @@ from pydantic import BaseModel, ConfigDict, Field, Secret, field_validator

 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.logger import logger
-from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -29,6 +28,7 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
 )
+from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements

 SimilarityFunction = Literal["cosine"]

@@ -132,7 +132,7 @@ class Neo4jUploadStager(UploadStager):
         if self._is_chunk(element):
             origin_element_nodes = [
                 self._create_element_node(origin_element)
-                for origin_element in self._get_origin_elements(element)
+                for origin_element in format_and_truncate_orig_elements(element)
             ]
             graph.add_edges_from(
                 [
@@ -166,7 +166,11 @@ class Neo4jUploadStager(UploadStager):
         return _Node(id_=file_data.identifier, properties=properties, labels=[Label.DOCUMENT])

     def _create_element_node(self, element: dict) -> _Node:
-        properties = {"id": element["element_id"], "text": element["text"]}
+        properties = {"id": element["element_id"]}
+
+        if text := element.get("text"):
+            # if we have chunks, we won't have text here for the original elements
+            properties["text"] = text

         if embeddings := element.get("embeddings"):
             properties["embeddings"] = embeddings
@@ -174,10 +178,6 @@ class Neo4jUploadStager(UploadStager):
         label = Label.CHUNK if self._is_chunk(element) else Label.UNSTRUCTURED_ELEMENT
         return _Node(id_=element["element_id"], properties=properties, labels=[label])

-    def _get_origin_elements(self, chunk_element: dict) -> list[dict]:
-        orig_elements = chunk_element.get("metadata", {}).get("orig_elements")
-        return elements_from_base64_gzipped_json(raw_s=orig_elements)
-

 class _GraphData(BaseModel):
     nodes: list[_Node]

unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -1,6 +1,7 @@
 import json
+import re
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Literal, Optional

 from pydantic import Field, Secret

@@ -13,10 +14,10 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    VectorDBUploader,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
@@ -41,7 +42,7 @@ class PineconeAccessConfig(AccessConfig):


 class PineconeConnectionConfig(ConnectionConfig):
-    index_name: str = Field(description="Name of the index to connect to.")
+    index_name: Optional[str] = Field(description="Name of the index to connect to.", default=None)
     access_config: Secret[PineconeAccessConfig] = Field(
         default=PineconeAccessConfig(), validate_default=True
     )
@@ -160,18 +161,101 @@ class PineconeUploadStager(UploadStager):


 @dataclass
-class PineconeUploader(Uploader):
+class PineconeUploader(VectorDBUploader):
     upload_config: PineconeUploaderConfig
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def index_exists(self, index_name: Optional[str]) -> bool:
+        from pinecone.exceptions import NotFoundException
+
+        index_name = index_name or self.connection_config.index_name
+        pc = self.connection_config.get_client()
+        try:
+            pc.describe_index(index_name)
+            return True
+        except NotFoundException:
+            return False
+        except Exception as e:
+            logger.error(f"failed to check if pinecone index exists : {e}")
+            raise DestinationConnectionError(f"failed to check if pinecone index exists : {e}")
+
     def precheck(self):
         try:
-            self.connection_config.get_index()
+            # just a connection check here. not an actual index_exists check
+            self.index_exists("just-checking-our-connection")
+
+            if self.connection_config.index_name and not self.index_exists(
+                self.connection_config.index_name
+            ):
+                raise DestinationConnectionError(
+                    f"index {self.connection_config.index_name} does not exist"
+                )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

+    def format_destination_name(self, destination_name: str) -> str:
+        # Pinecone naming requirements:
+        # can only contain lowercase letters, numbers, and hyphens
+        # must be 45 characters or less
+        formatted = re.sub(r"[^a-z0-9]", "-", destination_name.lower())
+        return formatted
+
+    def create_destination(
+        self,
+        vector_length: int,
+        destination_name: str = "elements",
+        destination_type: Literal["pod", "serverless"] = "serverless",
+        serverless_cloud: str = "aws",
+        serverless_region: str = "us-west-2",
+        pod_environment: str = "us-east1-gcp",
+        pod_type: str = "p1.x1",
+        pod_count: int = 1,
+        **kwargs: Any,
+    ) -> bool:
+        from pinecone import PodSpec, ServerlessSpec
+
+        index_name = destination_name or self.connection_config.index_name
+        index_name = self.format_destination_name(index_name)
+        self.connection_config.index_name = index_name
+
+        if not self.index_exists(index_name):
+
+            logger.info(f"creating pinecone index {index_name}")
+
+            pc = self.connection_config.get_client()
+
+            if destination_type == "serverless":
+                pc.create_index(
+                    name=destination_name,
+                    dimension=vector_length,
+                    spec=ServerlessSpec(cloud=serverless_cloud, region=serverless_region),
+                    **kwargs,
+                )
+
+                return True
+
+            elif destination_type == "pod":
+                pc.create_index(
+                    name=destination_name,
+                    dimension=vector_length,
+                    spec=PodSpec(environment=pod_environment, pod_type=pod_type, pods=pod_count),
+                    **kwargs,
+                )
+
+                return True
+
+            else:
+                raise ValueError(f"unexpected destination type: {destination_type}")
+
+        else:
+            logger.debug(f"index {index_name} already exists, skipping creation")
+            return False
+
     def pod_delete_by_record_id(self, file_data: FileData) -> None:
         logger.debug(
             f"deleting any content with metadata "
@@ -266,6 +350,10 @@ class PineconeUploader(Uploader):
         )
         # Determine if serverless or pod based index
         pinecone_client = self.connection_config.get_client()
+
+        if not self.connection_config.index_name:
+            raise ValueError("No index name specified")
+
        index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
        if "serverless" in index_description.get("spec"):
            self.serverless_delete_by_record_id(file_data=file_data)
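
Taken together, a PineconeUploader can now be constructed without an index_name and asked to create a serverless index sized to the embedding dimension. A hedged usage sketch (the API key and index name are placeholders):

from unstructured_ingest.v2.processes.connectors.pinecone import (
    PineconeAccessConfig,
    PineconeConnectionConfig,
    PineconeUploader,
    PineconeUploaderConfig,
)

uploader = PineconeUploader(
    connection_config=PineconeConnectionConfig(
        access_config=PineconeAccessConfig(api_key="PLACEHOLDER-API-KEY")  # placeholder
    ),
    upload_config=PineconeUploaderConfig(),
)
# format_destination_name lowercases and hyphenates, e.g. "My_Index" -> "my-index"
uploader.create_destination(destination_name="my-index", vector_length=1536)

Note that create_destination stores the formatted name on connection_config.index_name but passes the original destination_name to pc.create_index, so a name that already satisfies Pinecone's rules avoids any mismatch.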

unstructured_ingest/v2/processes/connectors/utils.py
@@ -5,6 +5,8 @@ from typing import Any, Union
 from dateutil import parser
 from pydantic import ValidationError

+from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
+

 def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
     if isinstance(date_value, datetime):
@@ -27,3 +29,29 @@ def conform_string_to_dict(value: Any) -> dict:
     if isinstance(value, str):
         return json.loads(value)
     raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
+
+
+def format_and_truncate_orig_elements(element: dict) -> list[dict[str, Any]]:
+    """
+    This function is used to format and truncate the orig_elements field in the metadata.
+    This is used to remove the text field and other larger fields from the orig_elements
+    that are not helpful in filtering/searching when used along with chunked elements.
+    """
+    metadata = element.get("metadata", {})
+    raw_orig_elements = metadata.get("orig_elements", None)
+    orig_elements = []
+    if raw_orig_elements is not None:
+        for element in elements_from_base64_gzipped_json(raw_orig_elements):
+            element.pop("text", None)
+            for prop in (
+                "image_base64",
+                "text_as_html",
+                "table_as_cells",
+                "link_urls",
+                "link_texts",
+                "link_start_indexes",
+                "emphasized_text_contents",
+            ):
+                element["metadata"].pop(prop, None)
+            orig_elements.append(element)
+    return orig_elements
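
format_and_truncate_orig_elements first decodes metadata["orig_elements"] with elements_from_base64_gzipped_json, then drops text and other bulky metadata fields. The decoding step is elided in this stand-in sketch, which reproduces only the stripping logic on already-decoded dicts:

from typing import Any

BULKY_PROPS = (
    "image_base64",
    "text_as_html",
    "table_as_cells",
    "link_urls",
    "link_texts",
    "link_start_indexes",
    "emphasized_text_contents",
)

def strip_bulky_fields(decoded_elements: list[dict[str, Any]]) -> list[dict[str, Any]]:
    slim = []
    for el in decoded_elements:
        el.pop("text", None)  # chunks keep the full text; the originals do not need it
        for prop in BULKY_PROPS:
            el["metadata"].pop(prop, None)
        slim.append(el)
    return slim

decoded = [{"element_id": "abc", "text": "body", "metadata": {"text_as_html": "<p>body</p>", "page_number": 1}}]
print(strip_bulky_fields(decoded))
# [{'element_id': 'abc', 'metadata': {'page_number': 1}}]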

unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py
@@ -1,4 +1,5 @@
 import json
+import re
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -229,19 +230,29 @@ class WeaviateUploader(VectorDBUploader, ABC):
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def init(self, *kwargs: Any) -> None:
-        self.create_destination()
+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def format_destination_name(self, destination_name: str) -> str:
+        # Weaviate naming requirements:
+        # must be alphanumeric and underscores only
+        formatted = re.sub(r"[^a-zA-Z0-9]", "_", destination_name)
+        # must begin with capital letter
+        return formatted.capitalize()

     def create_destination(
         self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
     ) -> bool:
+        destination_name = self.format_destination_name(destination_name)
         collection_name = self.upload_config.collection or destination_name
         self.upload_config.collection = collection_name
+
         connectors_dir = Path(__file__).parents[1]
         collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
         with collection_config_file.open() as f:
             collection_config = json.load(f)
         collection_config["class"] = collection_name
+
         if not self._collection_exists():
             logger.info(
                 f"creating default weaviate collection '{collection_name}' with default configs"

unstructured_ingest/v2/processes/embedder.py
@@ -186,7 +186,7 @@ class EmbedderConfig(BaseModel):
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig

-    def init(self, *kwargs: Any) -> None:
+    def init(self, **kwargs: Any) -> None:
         self.config.get_embedder().initialize()

     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:

unstructured_ingest-0.5.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.4
+Version: 0.5.6
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: pandas
-Requires-Dist: python-dateutil
 Requires-Dist: dataclasses_json
+Requires-Dist: click
+Requires-Dist: python-dateutil
 Requires-Dist: tqdm
 Requires-Dist: pydantic>=2.7
-Requires-Dist: click
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: pandas
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
 Provides-Extra: csv
@@ -66,23 +66,23 @@ Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: delta-table
@@ -98,9 +98,9 @@ Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -125,22 +125,22 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: neo4j-rust-ext; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Provides-Extra: pinecone
 Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
 Provides-Extra: postgres
@@ -155,8 +155,8 @@ Provides-Extra: s3
 Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -178,18 +178,18 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: requests; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Provides-Extra: vastdb
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
-Requires-Dist: vastdb; extra == "vastdb"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -197,11 +197,11 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Dynamic: author

unstructured_ingest-0.5.6.dist-info/RECORD
@@ -17,7 +17,7 @@ test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8u
 test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
 test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
 test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
-test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
+test/integration/connectors/test_pinecone.py,sha256=9FC0frer7gtDzk5A6OhGsV8S4ggYfa5ReEO9t7L3Am0,13649
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
 test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
@@ -54,7 +54,7 @@ test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_Jj
 test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
 test/integration/connectors/weaviate/test_cloud.py,sha256=U1ZS6a7wTPX7h3XGvaJHaT-Uwg4IeGgzxx1YBywgVhM,1284
-test/integration/connectors/weaviate/test_local.py,sha256=gXMpnzVcrNQdptDjx0haPWBU-dm1MQTkalgxocI3-L8,5287
+test/integration/connectors/weaviate/test_local.py,sha256=NMQh9kV_BoIrpXe5abGkUSJYsY2ipRSqyFS4EzH1o7s,5333
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
@@ -107,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=2QfHN0aecwYlnmO8dtgtrQp_DCM5nNLT2FX_S7HbQPk,42
+unstructured_ingest/__version__.py,sha256=8heXQJ79JSGfqiDjjQtqcfkCTWOYFwgErKEt_wwF3c4,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -397,14 +397,14 @@ unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIws
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
 unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
 unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
-unstructured_ingest/v2/interfaces/process.py,sha256=6Ll0O9ATcdm36dx2_TOg9PfCEJrADgyd8OQK3TTNzZM,448
+unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
 unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
-unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-iZPCVsUaL0rljcME,2090
+unstructured_ingest/v2/interfaces/uploader.py,sha256=diMkAD5HY8IYpeP1DoFeRD_SexAgOEl1nUcimNnyATc,2063
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
 unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
-unstructured_ingest/v2/pipeline/pipeline.py,sha256=b37fQGm_lGutQ3Jc0qePB15lkBiFavH9tCso3inm-3I,16564
+unstructured_ingest/v2/pipeline/pipeline.py,sha256=UeOk5SywJZIn3kCnHclQ2cP7JJIXb4NDjpwzsCP_cF0,16523
 unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
 unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
@@ -418,13 +418,13 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_v
 unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
 unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
 unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
-unstructured_ingest/v2/processes/embedder.py,sha256=64mWxKMzDtrm0_QWDUA3J8gChPDEVLP6bFnac_JPBRY,7925
+unstructured_ingest/v2/processes/embedder.py,sha256=4x-Rt5UCvwdgihDAr24hvTGDEd1CdKF9xJrf3aMU-ck,7926
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=ZC9mt85I3o_SLR4DvE7vPBGphMET994phFkTuT-L9B8,9998
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
 unstructured_ingest/v2/processes/connectors/__init__.py,sha256=KO1zn-96Qa49TOSZn-gv_RUMGMCmUcdtHoeJqCpxPLY,6219
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
-unstructured_ingest/v2/processes/connectors/astradb.py,sha256=xhUMoUdnrfAY1isZGqsV4lZUsnZNpbvgLyQWQbR4hVo,14814
+unstructured_ingest/v2/processes/connectors/astradb.py,sha256=v2M-xpI7NViikEaHCmuWUQU5XokDOOWbOFXYUXF63Ps,15002
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=_zkiST0FTggEKNORalCcZZIRGZKnCM0LLcavgQZfDVE,11112
@@ -437,15 +437,15 @@ unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOG
 unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
-unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=sjwQWp6gPP_MR8vh4aaMJUzPmkGT_3FODTlB5-7tVh0,17525
+unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=ijp5hjmDpLoIHL9UJzV4_4vVtQBlQ2R_vLatlUYivX4,17464
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=U5gSa8S08JvCwmAhE8aV0yxGTIFnUlKVsQDybE8Fqb8,10746
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=93WmYO9OT8er9DSlh8odbJCtjcLsVMlqyXlYADgDEjc,14013
 unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
-unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
+unstructured_ingest/v2/processes/connectors/utils.py,sha256=ru_4e5lo5t1jJhR8sGYa5nNhX3gKTgC5B7Oze9qQJjo,2000
 unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
 unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
@@ -566,10 +566,10 @@ unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWa
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
-unstructured_ingest-0.5.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.4.dist-info/METADATA,sha256=KMDH7tStB6vEdpMqI8VQqTyvloKDieXb47rRsiW9OYk,8316
-unstructured_ingest-0.5.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.5.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.4.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yfABJKJGCvPuZ2XCNtDOuCtiscdEAmBCSPPNZnbTKDk,12821
+unstructured_ingest-0.5.6.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.6.dist-info/METADATA,sha256=ts8jHfqXkNXKcF9TL5UqQNHkynZuzjiobUomXaqiYgM,8316
+unstructured_ingest-0.5.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.6.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.6.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.6.dist-info/RECORD,,