unstructured-ingest 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic in its registry.
- test/integration/connectors/test_pinecone.py +34 -0
- test/integration/connectors/weaviate/test_local.py +3 -3
- test/integration/embedders/test_mixedbread.py +2 -2
- test/integration/embedders/test_voyageai.py +16 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/interfaces.py +2 -2
- unstructured_ingest/embed/voyageai.py +12 -2
- unstructured_ingest/v2/interfaces/process.py +1 -1
- unstructured_ingest/v2/interfaces/uploader.py +2 -2
- unstructured_ingest/v2/pipeline/pipeline.py +26 -31
- unstructured_ingest/v2/processes/connectors/pinecone.py +93 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +13 -2
- unstructured_ingest/v2/processes/embedder.py +1 -1
- {unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/METADATA +23 -23
- {unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/RECORD +19 -19
- {unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_pinecone.py

@@ -351,3 +351,37 @@ def test_pinecone_stager(
         stager=stager,
         tmp_dir=tmp_path,
     )
+
+
+@requires_env(API_KEY)
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
+def test_pinecone_create_destination(pinecone_index):
+    uploader = PineconeUploader(
+        connection_config=PineconeConnectionConfig(
+            access_config=PineconeAccessConfig(api_key=get_api_key())
+        ),
+        upload_config=PineconeUploaderConfig(),
+    )
+
+    random_id = str(uuid4()).split("-")[0]
+
+    index_name = f"test-create-destination-{random_id}"
+
+    assert not uploader.index_exists(index_name=index_name)
+
+    try:
+        uploader.create_destination(destination_name=index_name, vector_length=1536)
+    except Exception as e:
+        error_body = getattr(e, "body", None)
+        raise pytest.fail(f"failed to create destination: {e} {error_body}")
+
+    assert uploader.index_exists(index_name=index_name), "destination was not created successfully"
+
+    try:
+        pc = uploader.connection_config.get_client()
+        logger.info(f"deleting index for test create destination: {index_name}")
+        pc.delete_index(name=index_name)
+    except Exception as e:
+        raise pytest.fail(f"failed to cleanup / delete the destination: {e}")
+
+    assert not uploader.index_exists(index_name=index_name), "cleanup failed"
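The new integration test walks the full lifecycle: check that the index is absent, create it sized to the embedding width, verify it exists, then delete it. A minimal sketch of the same lifecycle outside pytest, assuming unstructured-ingest 0.5.7 is installed and that the Pinecone key is read from a PINECONE_API_KEY environment variable (the variable name is an assumption; the test obtains its key via get_api_key()):

```python
# Sketch of the create/verify/cleanup lifecycle exercised by the new test.
import os
from uuid import uuid4

from unstructured_ingest.v2.processes.connectors.pinecone import (
    PineconeAccessConfig,
    PineconeConnectionConfig,
    PineconeUploader,
    PineconeUploaderConfig,
)

uploader = PineconeUploader(
    connection_config=PineconeConnectionConfig(
        access_config=PineconeAccessConfig(api_key=os.environ["PINECONE_API_KEY"])  # assumed env var
    ),
    upload_config=PineconeUploaderConfig(),
)

index_name = f"demo-create-destination-{str(uuid4()).split('-')[0]}"
assert not uploader.index_exists(index_name=index_name)

# 1536 is just the vector width used by the test; match it to your embedding model.
uploader.create_destination(destination_name=index_name, vector_length=1536)
assert uploader.index_exists(index_name=index_name)

# Clean up through the raw Pinecone client exposed by the connection config.
uploader.connection_config.get_client().delete_index(name=index_name)
```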
test/integration/connectors/weaviate/test_local.py

@@ -78,7 +78,6 @@ def run_uploader_and_validate(
     validate_count(expected_count=expected_count)


-@pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
     file_data = FileData(

@@ -142,11 +141,12 @@ def test_weaviate_local_create_destination(weaviate_instance):
         upload_config=LocalWeaviateUploaderConfig(),
         connection_config=LocalWeaviateConnectionConfig(),
     )
-    collection_name = "system_created"
+    collection_name = "system_created-123"
+    formatted_collection_name = "System_created_123"
     created = uploader.create_destination(destination_name=collection_name)
     assert created
     with uploader.connection_config.get_client() as weaviate_client:
-        assert weaviate_client.collections.exists(name=
+        assert weaviate_client.collections.exists(name=formatted_collection_name)

     created = uploader.create_destination(destination_name=collection_name)
     assert not created
test/integration/embedders/test_mixedbread.py

@@ -50,7 +50,7 @@ def test_raw_mixedbread_embedder(embedder_file: Path):
         embedder=embedder,
         embedder_file=embedder_file,
         expected_dimension=1024,
-        expected_is_unit_vector=
+        expected_is_unit_vector=True,
     )


@@ -67,5 +67,5 @@ async def test_raw_async_mixedbread_embedder(embedder_file: Path):
         embedder=embedder,
         embedder_file=embedder_file,
         expected_dimension=1024,
-        expected_is_unit_vector=
+        expected_is_unit_vector=True,
     )
test/integration/embedders/test_voyageai.py

@@ -61,3 +61,19 @@ async def test_raw_async_voyageai_embedder(embedder_file: Path):
     await validate_raw_embedder_async(
         embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
     )
+
+
+@requires_env(API_KEY)
+def test_voyageai_multimodal_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder_config = EmbedderConfig(
+        embedding_provider="voyageai",
+        embedding_api_key=api_key,
+        embedding_model_name="voyage-multimodal-3",
+    )
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.5.5"  # pragma: no cover
+__version__ = "0.5.7"  # pragma: no cover
unstructured_ingest/embed/interfaces.py

@@ -49,7 +49,7 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
         exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)

     def get_client(self):
         raise NotImplementedError

@@ -103,7 +103,7 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
     async def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
         exemplary_embedding = await self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)

     def get_client(self):
         raise NotImplementedError
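The relaxed tolerance lets is_unit_vector accept embeddings that are normalized only to within float precision of unit length, which the mixedbread tests now expect to report True. A small self-contained illustration of what rtol=1e-03 changes:

```python
import numpy as np

# A vector whose norm is 0.9995: nearly unit length, but not exactly.
vec = np.array([0.6, 0.8]) * 0.9995
norm = np.linalg.norm(vec)

print(np.isclose(norm, 1.0))              # False under the default rtol=1e-05
print(np.isclose(norm, 1.0, rtol=1e-03))  # True under the loosened tolerance
```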
unstructured_ingest/embed/voyageai.py

@@ -96,7 +96,11 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return self.config.get_client()

     def embed_batch(self, client: "VoyageAIClient", batch: list[str]) -> list[list[float]]:
-        response = client.embed(texts=batch, model=self.config.embedder_model_name)
+        if self.config.embedder_model_name == "voyage-multimodal-3":
+            batch = [[text] for text in batch]
+            response = client.multimodal_embed(inputs=batch, model=self.config.embedder_model_name)
+        else:
+            response = client.embed(texts=batch, model=self.config.embedder_model_name)
         return response.embeddings


@@ -113,5 +117,11 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
     async def embed_batch(
         self, client: "AsyncVoyageAIClient", batch: list[str]
     ) -> list[list[float]]:
-        response = await client.embed(texts=batch, model=self.config.embedder_model_name)
+        if self.config.embedder_model_name == "voyage-multimodal-3":
+            batch = [[text] for text in batch]
+            response = await client.multimodal_embed(
+                inputs=batch, model=self.config.embedder_model_name
+            )
+        else:
+            response = await client.embed(texts=batch, model=self.config.embedder_model_name)
         return response.embeddings
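The multimodal endpoint takes a list of inputs where each input is itself a list of content items, so plain-text batches are wrapped one level deeper before the call. A minimal sketch of the same branching against the VoyageAI client directly, assuming the voyageai package is installed and the key lives in a VOYAGE_API_KEY environment variable (variable name assumed):

```python
# Sketch of the branching added in embed_batch, using the VoyageAI client directly.
import os

import voyageai

client = voyageai.Client(api_key=os.environ["VOYAGE_API_KEY"])  # assumed env var
model = "voyage-multimodal-3"
batch = ["first chunk of text", "second chunk of text"]

if model == "voyage-multimodal-3":
    # Each multimodal input is a list of content items; here, a single text item per input.
    response = client.multimodal_embed(inputs=[[text] for text in batch], model=model)
else:
    response = client.embed(texts=batch, model=model)

print(len(response.embeddings), len(response.embeddings[0]))
```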
unstructured_ingest/v2/interfaces/process.py

@@ -1,7 +1,7 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any,
+from typing import Any, TypeVar

 from pydantic import BaseModel

unstructured_ingest/v2/interfaces/uploader.py

@@ -61,6 +61,6 @@ class Uploader(BaseProcess, BaseConnector, ABC):
 @dataclass
 class VectorDBUploader(Uploader, ABC):
     def create_destination(
-        self, destination_name: str = "elements",
+        self, vector_length: int, destination_name: str = "elements", **kwargs: Any
     ) -> bool:
         return False
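With this change vector_length is a required argument on the base hook, so a vector destination always learns the embedding width before it provisions anything. A minimal sketch of a subclass honoring the new signature (the class name and the index helpers here are hypothetical, not part of the library):

```python
from typing import Any

from unstructured_ingest.v2.interfaces import VectorDBUploader


class MyVectorStoreUploader(VectorDBUploader):  # hypothetical destination
    def create_destination(
        self, vector_length: int, destination_name: str = "elements", **kwargs: Any
    ) -> bool:
        # Provision an index sized to the embedder output; return True only when
        # something was actually created, False when it already existed.
        if self._index_exists(destination_name):  # hypothetical helper
            return False
        self._create_index(name=destination_name, dimension=vector_length)  # hypothetical helper
        return True
```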
unstructured_ingest/v2/pipeline/pipeline.py

@@ -126,14 +126,32 @@ class Pipeline:
             for kk, vv in v.items():
                 logger.error(f"{k}: [{kk}] {vv}")

+    def _run_initialization(self):
+        failures = {}
+        init_kwargs = {}
+        for step in self._get_ordered_steps():
+            try:
+                step.process.init(**init_kwargs)
+                step.process.precheck()
+                # Make sure embedder dimensions available for downstream steps
+                if isinstance(step.process, Embedder):
+                    embed_dimensions = step.process.config.get_embedder().dimension
+                    init_kwargs["vector_length"] = embed_dimensions
+
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step initialization failure: {k}: {v}")
+            raise PipelineError("Initialization failed")
+
     def run(self):
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.info)
         try:
             with otel_handler.get_tracer().start_as_current_span(
                 "ingest process", record_exception=True
             ):
-                self._run_inits()
-                self._run_prechecks()
+                self._run_initialization()
                 self._run()
         finally:
             self.log_statuses()

@@ -154,43 +172,20 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None

-    def _get_all_steps(self):
-        steps = [self.indexer_step, self.downloader_step
+    def _get_ordered_steps(self) -> list[PipelineStep]:
+        steps = [self.indexer_step, self.downloader_step]
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        steps.append(self.partitioner_step)
         if self.chunker_step:
             steps.append(self.chunker_step)
         if self.embedder_step:
             steps.append(self.embedder_step)
-        if self.uncompress_step:
-            steps.append(self.uncompress_step)
         if self.stager_step:
             steps.append(self.stager_step)
+        steps.append(self.uploader_step)
         return steps

-    def _run_inits(self):
-        failures = {}
-
-        for step in self._get_all_steps():
-            try:
-                step.process.init()
-            except Exception as e:
-                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
-        if failures:
-            for k, v in failures.items():
-                logger.error(f"Step init failure: {k}: {v}")
-            raise PipelineError("Init failed")
-
-    def _run_prechecks(self):
-        failures = {}
-        for step in self._get_all_steps():
-            try:
-                step.process.precheck()
-            except Exception as e:
-                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
-        if failures:
-            for k, v in failures.items():
-                logger.error(f"Step precheck failure: {k}: {v}")
-            raise PipelineError("Precheck failed")
-
     def apply_filter(self, records: list[dict]) -> list[dict]:
         if not self.filter_step:
             return records
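The net effect: initialization and prechecks now happen in one pass over the steps in pipeline order, and once the embedder step is initialized its dimension is forwarded as vector_length to every later step's init, which is how a vector destination learns how big an index to create. A stripped-down sketch of that hand-off (these stand-in classes are illustrative, not the real pipeline types):

```python
# Illustrative stand-ins for the vector_length hand-off; not the real pipeline classes.
class FakeEmbedder:
    dimension = 1024

    def init(self, **kwargs):
        pass


class FakeUploader:
    def init(self, vector_length=None, **kwargs):
        print(f"would create destination with vector_length={vector_length}")


def run_initialization(ordered_processes):
    init_kwargs = {}
    for process in ordered_processes:
        process.init(**init_kwargs)
        # After the embedder is initialized, expose its output width to later steps.
        if isinstance(process, FakeEmbedder):
            init_kwargs["vector_length"] = process.dimension


run_initialization([FakeEmbedder(), FakeUploader()])  # prints vector_length=1024
```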
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -1,6 +1,7 @@
 import json
+import re
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Literal, Optional

 from pydantic import Field, Secret

@@ -13,10 +14,10 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    VectorDBUploader,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

@@ -41,7 +42,7 @@ class PineconeAccessConfig(AccessConfig):


 class PineconeConnectionConfig(ConnectionConfig):
-    index_name: str = Field(description="Name of the index to connect to.")
+    index_name: Optional[str] = Field(description="Name of the index to connect to.", default=None)
     access_config: Secret[PineconeAccessConfig] = Field(
         default=PineconeAccessConfig(), validate_default=True
     )

@@ -160,18 +161,101 @@ class PineconeUploadStager(UploadStager):


 @dataclass
-class PineconeUploader(Uploader):
+class PineconeUploader(VectorDBUploader):
     upload_config: PineconeUploaderConfig
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def index_exists(self, index_name: Optional[str]) -> bool:
+        from pinecone.exceptions import NotFoundException
+
+        index_name = index_name or self.connection_config.index_name
+        pc = self.connection_config.get_client()
+        try:
+            pc.describe_index(index_name)
+            return True
+        except NotFoundException:
+            return False
+        except Exception as e:
+            logger.error(f"failed to check if pinecone index exists : {e}")
+            raise DestinationConnectionError(f"failed to check if pinecone index exists : {e}")
+
     def precheck(self):
         try:
-
+            # just a connection check here. not an actual index_exists check
+            self.index_exists("just-checking-our-connection")
+
+            if self.connection_config.index_name and not self.index_exists(
+                self.connection_config.index_name
+            ):
+                raise DestinationConnectionError(
+                    f"index {self.connection_config.index_name} does not exist"
+                )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

+    def format_destination_name(self, destination_name: str) -> str:
+        # Pinecone naming requirements:
+        # can only contain lowercase letters, numbers, and hyphens
+        # must be 45 characters or less
+        formatted = re.sub(r"[^a-z0-9]", "-", destination_name.lower())
+        return formatted
+
+    def create_destination(
+        self,
+        vector_length: int,
+        destination_name: str = "elements",
+        destination_type: Literal["pod", "serverless"] = "serverless",
+        serverless_cloud: str = "aws",
+        serverless_region: str = "us-west-2",
+        pod_environment: str = "us-east1-gcp",
+        pod_type: str = "p1.x1",
+        pod_count: int = 1,
+        **kwargs: Any,
+    ) -> bool:
+        from pinecone import PodSpec, ServerlessSpec
+
+        index_name = destination_name or self.connection_config.index_name
+        index_name = self.format_destination_name(index_name)
+        self.connection_config.index_name = index_name
+
+        if not self.index_exists(index_name):
+
+            logger.info(f"creating pinecone index {index_name}")
+
+            pc = self.connection_config.get_client()
+
+            if destination_type == "serverless":
+                pc.create_index(
+                    name=destination_name,
+                    dimension=vector_length,
+                    spec=ServerlessSpec(cloud=serverless_cloud, region=serverless_region),
+                    **kwargs,
+                )
+
+                return True
+
+            elif destination_type == "pod":
+                pc.create_index(
+                    name=destination_name,
+                    dimension=vector_length,
+                    spec=PodSpec(environment=pod_environment, pod_type=pod_type, pods=pod_count),
+                    **kwargs,
+                )
+
+                return True
+
+            else:
+                raise ValueError(f"unexpected destination type: {destination_type}")
+
+        else:
+            logger.debug(f"index {index_name} already exists, skipping creation")
+            return False
+
     def pod_delete_by_record_id(self, file_data: FileData) -> None:
         logger.debug(
             f"deleting any content with metadata "

@@ -266,6 +350,10 @@ class PineconeUploader(Uploader):
         )
         # Determine if serverless or pod based index
         pinecone_client = self.connection_config.get_client()
+
+        if not self.connection_config.index_name:
+            raise ValueError("No index name specified")
+
         index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
         if "serverless" in index_description.get("spec"):
             self.serverless_delete_by_record_id(file_data=file_data)
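init now simply forwards its keyword arguments to create_destination, so when the pipeline passes vector_length the index is provisioned automatically, and destination names are normalized first because Pinecone only allows lowercase letters, digits, and hyphens. A standalone illustration of that normalization rule:

```python
import re


def format_destination_name(destination_name: str) -> str:
    # Same rule as PineconeUploader.format_destination_name:
    # anything outside [a-z0-9] becomes a hyphen.
    return re.sub(r"[^a-z0-9]", "-", destination_name.lower())


print(format_destination_name("My Elements_2024"))  # my-elements-2024
```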
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py

@@ -1,4 +1,5 @@
 import json
+import re
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field

@@ -229,19 +230,29 @@ class WeaviateUploader(VectorDBUploader, ABC):
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def init(self,
-        self.create_destination()
+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def format_destination_name(self, destination_name: str) -> str:
+        # Weaviate naming requirements:
+        # must be alphanumeric and underscores only
+        formatted = re.sub(r"[^a-zA-Z0-9]", "_", destination_name)
+        # must begin with capital letter
+        return formatted.capitalize()

     def create_destination(
         self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
     ) -> bool:
+        destination_name = self.format_destination_name(destination_name)
         collection_name = self.upload_config.collection or destination_name
         self.upload_config.collection = collection_name
+
         connectors_dir = Path(__file__).parents[1]
         collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
         with collection_config_file.open() as f:
             collection_config = json.load(f)
         collection_config["class"] = collection_name
+
         if not self._collection_exists():
             logger.info(
                 f"creating default weaviate collection '{collection_name}' with default configs"
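Weaviate collection names must be alphanumeric plus underscores and start with a capital letter, which is exactly what the updated local test asserts: "system_created-123" ends up as the collection "System_created_123". A standalone illustration of the same normalization:

```python
import re


def format_destination_name(destination_name: str) -> str:
    # Same rule as WeaviateUploader.format_destination_name:
    # non-alphanumerics become underscores, then the first character is capitalized.
    return re.sub(r"[^a-zA-Z0-9]", "_", destination_name).capitalize()


print(format_destination_name("system_created-123"))  # System_created_123
```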
unstructured_ingest/v2/processes/embedder.py

@@ -186,7 +186,7 @@ class EmbedderConfig(BaseModel):
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig

-    def init(self,
+    def init(self, **kwargs: Any) -> None:
         self.config.get_embedder().initialize()

     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
{unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.5
+Version: 0.5.7
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies

@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: tqdm
-Requires-Dist: pydantic>=2.7
-Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: pydantic>=2.7
+Requires-Dist: dataclasses_json
 Requires-Dist: python-dateutil
+Requires-Dist: tqdm
+Requires-Dist: click
 Requires-Dist: pandas
-Requires-Dist: dataclasses_json
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
 Provides-Extra: csv

@@ -66,16 +66,16 @@ Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai

@@ -98,19 +98,19 @@ Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka

@@ -124,23 +124,23 @@ Requires-Dist: pymilvus; extra == "milvus"
 Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: neo4j
-Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: neo4j-rust-ext; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pinecone
 Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
 Provides-Extra: postgres

@@ -152,11 +152,11 @@ Requires-Dist: praw; extra == "reddit"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp

@@ -178,9 +178,9 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: aiofiles; extra == "vectara"
-Requires-Dist: requests; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
 Provides-Extra: vastdb
 Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"

@@ -188,8 +188,8 @@ Requires-Dist: vastdb; extra == "vastdb"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai

@@ -197,11 +197,11 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Dynamic: author
{unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/RECORD

@@ -17,7 +17,7 @@ test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8u
 test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
 test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
 test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
-test/integration/connectors/test_pinecone.py,sha256=
+test/integration/connectors/test_pinecone.py,sha256=9FC0frer7gtDzk5A6OhGsV8S4ggYfa5ReEO9t7L3Am0,13649
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
 test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473

@@ -54,18 +54,18 @@ test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_Jj
 test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
 test/integration/connectors/weaviate/test_cloud.py,sha256=U1ZS6a7wTPX7h3XGvaJHaT-Uwg4IeGgzxx1YBywgVhM,1284
-test/integration/connectors/weaviate/test_local.py,sha256=
+test/integration/connectors/weaviate/test_local.py,sha256=NMQh9kV_BoIrpXe5abGkUSJYsY2ipRSqyFS4EzH1o7s,5333
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
 test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
 test/integration/embedders/test_bedrock.py,sha256=vmjoi1uUk-LX4Yz0ZPn6Ry1JdVEsyIhLhPbSPmkeT9o,3553
 test/integration/embedders/test_huggingface.py,sha256=qFblyXounVNRaNkk3gbKoBqU5E2dNecgKU2Bz2LyOa8,989
-test/integration/embedders/test_mixedbread.py,sha256=
+test/integration/embedders/test_mixedbread.py,sha256=oesaTY8H7es72vhctmNVU0oWkHNJQckHd_KD-K6kWxI,1996
 test/integration/embedders/test_octoai.py,sha256=qs-bqZ7iGWO_BzUZvKJmOHBT3cmFSkEYbleWhj3snJc,2197
 test/integration/embedders/test_openai.py,sha256=9XioXuvdnbh_3vRmRwpMsi1D5heCcY7KA4nHb5vOU_M,2127
 test/integration/embedders/test_togetherai.py,sha256=hsg3c3SGJGd93unz4-VLYmFXxLA1vmrD5xK5Gj-g0R4,2205
 test/integration/embedders/test_vertexai.py,sha256=4-E4plJXFf1b02RhOqOCBHR2GA4gTnc8K4AnHm6EgPU,1830
-test/integration/embedders/test_voyageai.py,sha256=
+test/integration/embedders/test_voyageai.py,sha256=hf8JP8eSL1MMFsmQ9rErM8oxCcwO6kC1WfzzBn7bnME,2414
 test/integration/embedders/utils.py,sha256=Sqqg-X31ZV1hojqPQBaZgM2lb2u8cG6s6OnH9JRsFjs,2717
 test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/partitioners/test_partitioner.py,sha256=6sdZhhtqEICBPqEgpKrCQIfJ-7hKcwuTFqjWs1mbQf8,2787

@@ -107,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=SJI27PQ23gz4_g984Mn5VF7Lgitn3vm0GQDyvqnYbdc,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479

@@ -278,13 +278,13 @@ unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
 unstructured_ingest/embed/bedrock.py,sha256=tZumLLXafSr1zIFVjckapRoiiY-7u65GPuWmwsdhY0I,7726
 unstructured_ingest/embed/huggingface.py,sha256=EWU1kd5Cm6ajgCw6hP5w_4pniGSgxnR0wM9vjuPQ6Yk,2334
-unstructured_ingest/embed/interfaces.py,sha256=
+unstructured_ingest/embed/interfaces.py,sha256=_-CqasY6R5nnNUY-X6PS5lz8dsmGaUw5zIGRdPfx16o,4918
 unstructured_ingest/embed/mixedbreadai.py,sha256=-Y0J27G9CL1t3ZTIeNjTjRviErSMAzJRf2zgDgMHUmg,4499
 unstructured_ingest/embed/octoai.py,sha256=hNLEskDEP-2qWExUgVz2Eyw3KTIFwdUE9elbJ5qp4Ao,3855
 unstructured_ingest/embed/openai.py,sha256=Fe_17y-YpkiGcfrOxZFmgjV-Y-u8svhDVYyAjV-GeBM,3279
 unstructured_ingest/embed/togetherai.py,sha256=i1qeX2fwWtUf1vdGOGnpA_bJB__VzU1NQsR8k-KhxIw,2983
 unstructured_ingest/embed/vertexai.py,sha256=EcXhhm1IbCZVq4KA0sbJjyABu8jpF2ZL3JCqmuxPsjo,3688
-unstructured_ingest/embed/voyageai.py,sha256=
+unstructured_ingest/embed/voyageai.py,sha256=lsdiTHVE3CMUX4gXdn2AaRJcKPcKptzgYdF2McvQcvA,4496
 unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
 unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
 unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949

@@ -397,14 +397,14 @@ unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIws
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
 unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
 unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
-unstructured_ingest/v2/interfaces/process.py,sha256=
+unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
 unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
-unstructured_ingest/v2/interfaces/uploader.py,sha256=
+unstructured_ingest/v2/interfaces/uploader.py,sha256=diMkAD5HY8IYpeP1DoFeRD_SexAgOEl1nUcimNnyATc,2063
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
 unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
-unstructured_ingest/v2/pipeline/pipeline.py,sha256=
+unstructured_ingest/v2/pipeline/pipeline.py,sha256=UeOk5SywJZIn3kCnHclQ2cP7JJIXb4NDjpwzsCP_cF0,16523
 unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
 unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259

@@ -418,7 +418,7 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_v
 unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
 unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
 unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
-unstructured_ingest/v2/processes/embedder.py,sha256=
+unstructured_ingest/v2/processes/embedder.py,sha256=4x-Rt5UCvwdgihDAr24hvTGDEd1CdKF9xJrf3aMU-ck,7926
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=ZC9mt85I3o_SLR4DvE7vPBGphMET994phFkTuT-L9B8,9998
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425

@@ -440,7 +440,7 @@ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNN
 unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=ijp5hjmDpLoIHL9UJzV4_4vVtQBlQ2R_vLatlUYivX4,17464
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=93WmYO9OT8er9DSlh8odbJCtjcLsVMlqyXlYADgDEjc,14013
 unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827

@@ -566,10 +566,10 @@ unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWa
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yfABJKJGCvPuZ2XCNtDOuCtiscdEAmBCSPPNZnbTKDk,12821
+unstructured_ingest-0.5.7.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.7.dist-info/METADATA,sha256=Zr_UTJd0V_0vUjwukPd2BgrEh47hqfLSiwivBPAxJos,8316
+unstructured_ingest-0.5.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.7.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.7.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.7.dist-info/RECORD,,
{unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/LICENSE.md
RENAMED (file without changes)

{unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/WHEEL
RENAMED (file without changes)

{unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/entry_points.txt
RENAMED (file without changes)

{unstructured_ingest-0.5.5.dist-info → unstructured_ingest-0.5.7.dist-info}/top_level.txt
RENAMED (file without changes)