unstructured-ingest 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_pinecone.py +34 -0
- test/integration/connectors/weaviate/test_local.py +3 -3
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/process.py +1 -1
- unstructured_ingest/v2/interfaces/uploader.py +2 -2
- unstructured_ingest/v2/pipeline/pipeline.py +26 -31
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +7 -7
- unstructured_ingest/v2/processes/connectors/pinecone.py +93 -5
- unstructured_ingest/v2/processes/connectors/utils.py +28 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +13 -2
- unstructured_ingest/v2/processes/embedder.py +1 -1
- {unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/METADATA +22 -22
- {unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/RECORD +18 -18
- {unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_pinecone.py CHANGED
@@ -351,3 +351,37 @@ def test_pinecone_stager(
         stager=stager,
         tmp_dir=tmp_path,
     )
+
+
+@requires_env(API_KEY)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
+def test_pinecone_create_destination(pinecone_index):
+    uploader = PineconeUploader(
+        connection_config=PineconeConnectionConfig(
+            access_config=PineconeAccessConfig(api_key=get_api_key())
+        ),
+        upload_config=PineconeUploaderConfig(),
+    )
+
+    random_id = str(uuid4()).split("-")[0]
+
+    index_name = f"test-create-destination-{random_id}"
+
+    assert not uploader.index_exists(index_name=index_name)
+
+    try:
+        uploader.create_destination(destination_name=index_name, vector_length=1536)
+    except Exception as e:
+        error_body = getattr(e, "body", None)
+        raise pytest.fail(f"failed to create destination: {e} {error_body}")
+
+    assert uploader.index_exists(index_name=index_name), "destination was not created successfully"
+
+    try:
+        pc = uploader.connection_config.get_client()
+        logger.info(f"deleting index for test create destination: {index_name}")
+        pc.delete_index(name=index_name)
+    except Exception as e:
+        raise pytest.fail(f"failed to cleanup / delete the destination: {e}")
+
+    assert not uploader.index_exists(index_name=index_name), "cleanup failed"
test/integration/connectors/weaviate/test_local.py CHANGED
@@ -78,7 +78,6 @@ def run_uploader_and_validate(
     validate_count(expected_count=expected_count)
 
 
-@pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
     file_data = FileData(
@@ -142,11 +141,12 @@ def test_weaviate_local_create_destination(weaviate_instance):
         upload_config=LocalWeaviateUploaderConfig(),
         connection_config=LocalWeaviateConnectionConfig(),
     )
-    collection_name = "system_created"
+    collection_name = "system_created-123"
+    formatted_collection_name = "System_created_123"
     created = uploader.create_destination(destination_name=collection_name)
     assert created
     with uploader.connection_config.get_client() as weaviate_client:
-        assert weaviate_client.collections.exists(name=collection_name)
+        assert weaviate_client.collections.exists(name=formatted_collection_name)
 
     created = uploader.create_destination(destination_name=collection_name)
     assert not created
unstructured_ingest/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.4"  # pragma: no cover
+__version__ = "0.5.6"  # pragma: no cover
unstructured_ingest/v2/interfaces/process.py CHANGED
@@ -1,7 +1,7 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any,
+from typing import Any, TypeVar
 
 from pydantic import BaseModel
 
unstructured_ingest/v2/interfaces/uploader.py CHANGED
@@ -61,6 +61,6 @@ class Uploader(BaseProcess, BaseConnector, ABC):
 @dataclass
 class VectorDBUploader(Uploader, ABC):
     def create_destination(
-        self, destination_name: str = "elements", **kwargs: Any
+        self, vector_length: int, destination_name: str = "elements", **kwargs: Any
     ) -> bool:
         return False
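The signature change above is the heart of this release: VectorDBUploader.create_destination now takes the embedding dimension (vector_length) as a required argument, so concrete vector-store uploaders can size an index when they create it. A minimal sketch of a conforming subclass, assuming the import shown in the pinecone diff below; MyVectorUploader and its two helpers are hypothetical:

from dataclasses import dataclass
from typing import Any

from unstructured_ingest.v2.interfaces import VectorDBUploader

@dataclass
class MyVectorUploader(VectorDBUploader):
    def create_destination(
        self, vector_length: int, destination_name: str = "elements", **kwargs: Any
    ) -> bool:
        # Contract: return True only if a destination was actually created.
        if self._index_exists(destination_name):  # hypothetical helper
            return False
        self._create_index(name=destination_name, dimension=vector_length)  # hypothetical helper
        return True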
unstructured_ingest/v2/pipeline/pipeline.py CHANGED
@@ -126,14 +126,32 @@ class Pipeline:
             for kk, vv in v.items():
                 logger.error(f"{k}: [{kk}] {vv}")
 
+    def _run_initialization(self):
+        failures = {}
+        init_kwargs = {}
+        for step in self._get_ordered_steps():
+            try:
+                step.process.init(**init_kwargs)
+                step.process.precheck()
+                # Make sure embedder dimensions available for downstream steps
+                if isinstance(step.process, Embedder):
+                    embed_dimensions = step.process.config.get_embedder().dimension
+                    init_kwargs["vector_length"] = embed_dimensions
+
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step initialization failure: {k}: {v}")
+            raise PipelineError("Initialization failed")
+
     def run(self):
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
         try:
             with otel_handler.get_tracer().start_as_current_span(
                 "ingest process", record_exception=True
             ):
-                self._run_inits()
-                self._run_prechecks()
+                self._run_initialization()
             self._run()
         finally:
             self.log_statuses()
@@ -154,43 +172,20 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None
 
-    def _get_all_steps(self) -> list[PipelineStep]:
-        steps = [self.indexer_step, self.downloader_step, self.partitioner_step]
+    def _get_ordered_steps(self) -> list[PipelineStep]:
+        steps = [self.indexer_step, self.downloader_step]
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        steps.append(self.partitioner_step)
         if self.chunker_step:
             steps.append(self.chunker_step)
         if self.embedder_step:
             steps.append(self.embedder_step)
-        if self.uncompress_step:
-            steps.append(self.uncompress_step)
         if self.stager_step:
             steps.append(self.stager_step)
+        steps.append(self.uploader_step)
         return steps
 
-    def _run_inits(self):
-        failures = {}
-
-        for step in self._get_all_steps():
-            try:
-                step.process.init()
-            except Exception as e:
-                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
-        if failures:
-            for k, v in failures.items():
-                logger.error(f"Step init failure: {k}: {v}")
-            raise PipelineError("Init failed")
-
-    def _run_prechecks(self):
-        failures = {}
-        for step in self._get_all_steps():
-            try:
-                step.process.precheck()
-            except Exception as e:
-                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
-        if failures:
-            for k, v in failures.items():
-                logger.error(f"Step precheck failure: {k}: {v}")
-            raise PipelineError("Precheck failed")
-
     def apply_filter(self, records: list[dict]) -> list[dict]:
         if not self.filter_step:
             return records
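The net effect of the pipeline change: the separate _run_inits and _run_prechecks passes become one ordered _run_initialization pass, so a value produced while initializing one step can be handed to the steps after it. Concretely, once the embedder step is initialized, its embedding dimension is forwarded as vector_length to every later step's init, which is what lets the uploader create a correctly sized destination. A condensed sketch of that threading, using the names from the diff:

# Each step is initialized with whatever kwargs earlier steps contributed.
init_kwargs = {}
for step in pipeline._get_ordered_steps():    # indexer ... stager, uploader
    step.process.init(**init_kwargs)          # uploader receives vector_length here
    step.process.precheck()
    if isinstance(step.process, Embedder):    # embedder comes before the uploader
        init_kwargs["vector_length"] = step.process.config.get_embedder().dimension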
unstructured_ingest/v2/processes/connectors/astradb.py CHANGED
@@ -42,6 +42,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
+from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements
 
 if TYPE_CHECKING:
     from astrapy import AsyncCollection as AstraDBAsyncCollection
@@ -318,6 +319,7 @@ class AstraDBUploadStager(UploadStager):
             element_dict["metadata"]["text_as_html"] = truncate_string_bytes(
                 text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
             )
+        metadata["original_elements"] = format_and_truncate_orig_elements(element_dict)
 
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         self.truncate_dict_elements(element_dict)
unstructured_ingest/v2/processes/connectors/neo4j.py CHANGED
@@ -14,7 +14,6 @@ from pydantic import BaseModel, ConfigDict, Field, Secret, field_validator
 
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.logger import logger
-from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -29,6 +28,7 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
 )
+from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements
 
 SimilarityFunction = Literal["cosine"]
 
@@ -132,7 +132,7 @@ class Neo4jUploadStager(UploadStager):
         if self._is_chunk(element):
             origin_element_nodes = [
                 self._create_element_node(origin_element)
-                for origin_element in self._get_origin_elements(element)
+                for origin_element in format_and_truncate_orig_elements(element)
             ]
             graph.add_edges_from(
                 [
@@ -166,7 +166,11 @@ class Neo4jUploadStager(UploadStager):
         return _Node(id_=file_data.identifier, properties=properties, labels=[Label.DOCUMENT])
 
     def _create_element_node(self, element: dict) -> _Node:
-        properties = {"id": element["element_id"], "text": element["text"]}
+        properties = {"id": element["element_id"]}
+
+        if text := element.get("text"):
+            # if we have chunks, we won't have text here for the original elements
+            properties["text"] = text
 
         if embeddings := element.get("embeddings"):
             properties["embeddings"] = embeddings
@@ -174,10 +178,6 @@ class Neo4jUploadStager(UploadStager):
         label = Label.CHUNK if self._is_chunk(element) else Label.UNSTRUCTURED_ELEMENT
         return _Node(id_=element["element_id"], properties=properties, labels=[label])
 
-    def _get_origin_elements(self, chunk_element: dict) -> list[dict]:
-        orig_elements = chunk_element.get("metadata", {}).get("orig_elements")
-        return elements_from_base64_gzipped_json(raw_s=orig_elements)
-
 
 class _GraphData(BaseModel):
     nodes: list[_Node]
unstructured_ingest/v2/processes/connectors/pinecone.py CHANGED
@@ -1,6 +1,7 @@
 import json
+import re
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Literal, Optional
 
 from pydantic import Field, Secret
 
@@ -13,10 +14,10 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    VectorDBUploader,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
@@ -41,7 +42,7 @@ class PineconeAccessConfig(AccessConfig):
 
 
 class PineconeConnectionConfig(ConnectionConfig):
-    index_name: str = Field(description="Name of the index to connect to.")
+    index_name: Optional[str] = Field(description="Name of the index to connect to.", default=None)
     access_config: Secret[PineconeAccessConfig] = Field(
         default=PineconeAccessConfig(), validate_default=True
     )
@@ -160,18 +161,101 @@ class PineconeUploadStager(UploadStager):
 
 
 @dataclass
-class PineconeUploader(Uploader):
+class PineconeUploader(VectorDBUploader):
     upload_config: PineconeUploaderConfig
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def index_exists(self, index_name: Optional[str]) -> bool:
+        from pinecone.exceptions import NotFoundException
+
+        index_name = index_name or self.connection_config.index_name
+        pc = self.connection_config.get_client()
+        try:
+            pc.describe_index(index_name)
+            return True
+        except NotFoundException:
+            return False
+        except Exception as e:
+            logger.error(f"failed to check if pinecone index exists : {e}")
+            raise DestinationConnectionError(f"failed to check if pinecone index exists : {e}")
+
     def precheck(self):
         try:
-            self.connection_config.get_index()
+            # just a connection check here. not an actual index_exists check
+            self.index_exists("just-checking-our-connection")
+
+            if self.connection_config.index_name and not self.index_exists(
+                self.connection_config.index_name
+            ):
+                raise DestinationConnectionError(
+                    f"index {self.connection_config.index_name} does not exist"
+                )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    def format_destination_name(self, destination_name: str) -> str:
+        # Pinecone naming requirements:
+        # can only contain lowercase letters, numbers, and hyphens
+        # must be 45 characters or less
+        formatted = re.sub(r"[^a-z0-9]", "-", destination_name.lower())
+        return formatted
+
+    def create_destination(
+        self,
+        vector_length: int,
+        destination_name: str = "elements",
+        destination_type: Literal["pod", "serverless"] = "serverless",
+        serverless_cloud: str = "aws",
+        serverless_region: str = "us-west-2",
+        pod_environment: str = "us-east1-gcp",
+        pod_type: str = "p1.x1",
+        pod_count: int = 1,
+        **kwargs: Any,
+    ) -> bool:
+        from pinecone import PodSpec, ServerlessSpec
+
+        index_name = destination_name or self.connection_config.index_name
+        index_name = self.format_destination_name(index_name)
+        self.connection_config.index_name = index_name
+
+        if not self.index_exists(index_name):
+
+            logger.info(f"creating pinecone index {index_name}")
+
+            pc = self.connection_config.get_client()
+
+            if destination_type == "serverless":
+                pc.create_index(
+                    name=destination_name,
+                    dimension=vector_length,
+                    spec=ServerlessSpec(cloud=serverless_cloud, region=serverless_region),
+                    **kwargs,
+                )
+
+                return True
+
+            elif destination_type == "pod":
+                pc.create_index(
+                    name=destination_name,
+                    dimension=vector_length,
+                    spec=PodSpec(environment=pod_environment, pod_type=pod_type, pods=pod_count),
+                    **kwargs,
+                )
+
+                return True
+
+            else:
+                raise ValueError(f"unexpected destination type: {destination_type}")
+
+        else:
+            logger.debug(f"index {index_name} already exists, skipping creation")
+            return False
+
     def pod_delete_by_record_id(self, file_data: FileData) -> None:
         logger.debug(
             f"deleting any content with metadata "
@@ -266,6 +350,10 @@ class PineconeUploader(Uploader):
         )
         # Determine if serverless or pod based index
         pinecone_client = self.connection_config.get_client()
+
+        if not self.connection_config.index_name:
+            raise ValueError("No index name specified")
+
         index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
         if "serverless" in index_description.get("spec"):
            self.serverless_delete_by_record_id(file_data=file_data)
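Put together, PineconeUploader.init(**kwargs) can now provision a serverless index sized by the pipeline instead of requiring a pre-existing one. A hedged usage sketch (the API key is a placeholder; 1536 matches the dimension used in the new integration test):

uploader = PineconeUploader(
    connection_config=PineconeConnectionConfig(
        access_config=PineconeAccessConfig(api_key="...")  # placeholder key
    ),
    upload_config=PineconeUploaderConfig(),
)
# format_destination_name lowercases the name and replaces anything outside
# [a-z0-9] with a hyphen, e.g. "My_Index" -> "my-index"; the formatted name is
# written back to connection_config.index_name before the existence check.
uploader.create_destination(destination_name="my-elements", vector_length=1536)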
unstructured_ingest/v2/processes/connectors/utils.py CHANGED
@@ -5,6 +5,8 @@ from typing import Any, Union
 from dateutil import parser
 from pydantic import ValidationError
 
+from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
+
 
 def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
     if isinstance(date_value, datetime):
@@ -27,3 +29,29 @@ def conform_string_to_dict(value: Any) -> dict:
     if isinstance(value, str):
         return json.loads(value)
     raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
+
+
+def format_and_truncate_orig_elements(element: dict) -> list[dict[str, Any]]:
+    """
+    This function is used to format and truncate the orig_elements field in the metadata.
+    This is used to remove the text field and other larger fields from the orig_elements
+    that are not helpful in filtering/searching when used along with chunked elements.
+    """
+    metadata = element.get("metadata", {})
+    raw_orig_elements = metadata.get("orig_elements", None)
+    orig_elements = []
+    if raw_orig_elements is not None:
+        for element in elements_from_base64_gzipped_json(raw_orig_elements):
+            element.pop("text", None)
+            for prop in (
+                "image_base64",
+                "text_as_html",
+                "table_as_cells",
+                "link_urls",
+                "link_texts",
+                "link_start_indexes",
+                "emphasized_text_contents",
+            ):
+                element["metadata"].pop(prop, None)
+            orig_elements.append(element)
+    return orig_elements
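Behaviorally, the new helper decodes the base64-gzipped orig_elements payload carried by chunked elements and drops the element text plus the listed large metadata fields. A small illustration, assuming `encoded` was produced by the matching encoder in unstructured_ingest.utils.chunking:

chunk = {"metadata": {"orig_elements": encoded}}  # encoded: gzipped, base64 JSON elements
for orig in format_and_truncate_orig_elements(chunk):
    assert "text" not in orig                       # element text is removed
    assert "text_as_html" not in orig["metadata"]   # large metadata fields are removed
    # element_id, type, and the remaining metadata survive for filtering/search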
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import re
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -229,19 +230,29 @@ class WeaviateUploader(VectorDBUploader, ABC):
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def init(self,
-        self.create_destination()
+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def format_destination_name(self, destination_name: str) -> str:
+        # Weaviate naming requirements:
+        # must be alphanumeric and underscores only
+        formatted = re.sub(r"[^a-zA-Z0-9]", "_", destination_name)
+        # must begin with capital letter
+        return formatted.capitalize()
 
     def create_destination(
         self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
     ) -> bool:
+        destination_name = self.format_destination_name(destination_name)
         collection_name = self.upload_config.collection or destination_name
         self.upload_config.collection = collection_name
+
         connectors_dir = Path(__file__).parents[1]
         collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
         with collection_config_file.open() as f:
             collection_config = json.load(f)
         collection_config["class"] = collection_name
+
         if not self._collection_exists():
             logger.info(
                 f"creating default weaviate collection '{collection_name}' with default configs"
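This formatter is what the updated local test above is checking: Weaviate collection names must be alphanumeric or underscore and must start with a capital letter. A standalone reproduction of the transformation for illustration (note that str.capitalize() also lowercases the rest of the string):

import re

def format_destination_name(destination_name: str) -> str:
    # Same transformation as the diff, reproduced outside the class.
    return re.sub(r"[^a-zA-Z0-9]", "_", destination_name).capitalize()

assert format_destination_name("system_created-123") == "System_created_123"
assert format_destination_name("myDocs") == "Mydocs"  # capitalize() lowercases the tail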
unstructured_ingest/v2/processes/embedder.py CHANGED
@@ -186,7 +186,7 @@ class EmbedderConfig(BaseModel):
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig
 
-    def init(self,
+    def init(self, **kwargs: Any) -> None:
         self.config.get_embedder().initialize()
 
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
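For context, this is the hook the pipeline initialization relies on: after Embedder.init runs, the configured embedder exposes the dimension that _run_initialization forwards as vector_length. A hedged sketch; the provider value is a placeholder and the exact EmbedderConfig fields are assumptions:

embedder = Embedder(config=EmbedderConfig(embedding_provider="huggingface"))  # placeholder config
embedder.init()
vector_length = embedder.config.get_embedder().dimension  # consumed by uploader.init(vector_length=...)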
{unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.4
+Version: 0.5.6
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: pandas
-Requires-Dist: python-dateutil
 Requires-Dist: dataclasses_json
+Requires-Dist: click
+Requires-Dist: python-dateutil
 Requires-Dist: tqdm
 Requires-Dist: pydantic>=2.7
-Requires-Dist: click
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: pandas
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
 Provides-Extra: csv
@@ -66,23 +66,23 @@ Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: delta-table
@@ -98,9 +98,9 @@ Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -125,22 +125,22 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: neo4j-rust-ext; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Provides-Extra: pinecone
 Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
 Provides-Extra: postgres
@@ -155,8 +155,8 @@ Provides-Extra: s3
 Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -178,18 +178,18 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: requests; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Provides-Extra: vastdb
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
-Requires-Dist: vastdb; extra == "vastdb"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -197,11 +197,11 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Dynamic: author
{unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/RECORD RENAMED
@@ -17,7 +17,7 @@ test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8u
 test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
 test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
 test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
-test/integration/connectors/test_pinecone.py,sha256=
+test/integration/connectors/test_pinecone.py,sha256=9FC0frer7gtDzk5A6OhGsV8S4ggYfa5ReEO9t7L3Am0,13649
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
 test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
@@ -54,7 +54,7 @@ test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_Jj
 test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
 test/integration/connectors/weaviate/test_cloud.py,sha256=U1ZS6a7wTPX7h3XGvaJHaT-Uwg4IeGgzxx1YBywgVhM,1284
-test/integration/connectors/weaviate/test_local.py,sha256=
+test/integration/connectors/weaviate/test_local.py,sha256=NMQh9kV_BoIrpXe5abGkUSJYsY2ipRSqyFS4EzH1o7s,5333
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
@@ -107,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=8heXQJ79JSGfqiDjjQtqcfkCTWOYFwgErKEt_wwF3c4,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -397,14 +397,14 @@ unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIws
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
 unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
 unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
-unstructured_ingest/v2/interfaces/process.py,sha256=
+unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
 unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
-unstructured_ingest/v2/interfaces/uploader.py,sha256=
+unstructured_ingest/v2/interfaces/uploader.py,sha256=diMkAD5HY8IYpeP1DoFeRD_SexAgOEl1nUcimNnyATc,2063
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
 unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
-unstructured_ingest/v2/pipeline/pipeline.py,sha256=
+unstructured_ingest/v2/pipeline/pipeline.py,sha256=UeOk5SywJZIn3kCnHclQ2cP7JJIXb4NDjpwzsCP_cF0,16523
 unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
 unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
@@ -418,13 +418,13 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_v
 unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
 unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
 unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
-unstructured_ingest/v2/processes/embedder.py,sha256=
+unstructured_ingest/v2/processes/embedder.py,sha256=4x-Rt5UCvwdgihDAr24hvTGDEd1CdKF9xJrf3aMU-ck,7926
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=ZC9mt85I3o_SLR4DvE7vPBGphMET994phFkTuT-L9B8,9998
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
 unstructured_ingest/v2/processes/connectors/__init__.py,sha256=KO1zn-96Qa49TOSZn-gv_RUMGMCmUcdtHoeJqCpxPLY,6219
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
-unstructured_ingest/v2/processes/connectors/astradb.py,sha256=
+unstructured_ingest/v2/processes/connectors/astradb.py,sha256=v2M-xpI7NViikEaHCmuWUQU5XokDOOWbOFXYUXF63Ps,15002
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=_zkiST0FTggEKNORalCcZZIRGZKnCM0LLcavgQZfDVE,11112
@@ -437,15 +437,15 @@ unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOG
 unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
-unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=
+unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=ijp5hjmDpLoIHL9UJzV4_4vVtQBlQ2R_vLatlUYivX4,17464
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=93WmYO9OT8er9DSlh8odbJCtjcLsVMlqyXlYADgDEjc,14013
 unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
-unstructured_ingest/v2/processes/connectors/utils.py,sha256=
+unstructured_ingest/v2/processes/connectors/utils.py,sha256=ru_4e5lo5t1jJhR8sGYa5nNhX3gKTgC5B7Oze9qQJjo,2000
 unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
 unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
@@ -566,10 +566,10 @@ unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWa
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=
-unstructured_ingest-0.5.4.dist-info/LICENSE.md,sha256=
-unstructured_ingest-0.5.4.dist-info/METADATA,sha256=
-unstructured_ingest-0.5.4.dist-info/WHEEL,sha256=
-unstructured_ingest-0.5.4.dist-info/entry_points.txt,sha256=
-unstructured_ingest-0.5.4.dist-info/top_level.txt,sha256=
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yfABJKJGCvPuZ2XCNtDOuCtiscdEAmBCSPPNZnbTKDk,12821
+unstructured_ingest-0.5.6.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.6.dist-info/METADATA,sha256=ts8jHfqXkNXKcF9TL5UqQNHkynZuzjiobUomXaqiYgM,8316
+unstructured_ingest-0.5.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.6.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.6.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.6.dist-info/RECORD,,
{unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/LICENSE.md RENAMED
File without changes
{unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/WHEEL RENAMED
File without changes
{unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/entry_points.txt RENAMED
File without changes
{unstructured_ingest-0.5.4.dist-info → unstructured_ingest-0.5.6.dist-info}/top_level.txt RENAMED
File without changes