unstructured-ingest 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_astradb.py +32 -5
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/pinecone.py +12 -2
- unstructured_ingest/v2/interfaces/uploader.py +4 -2
- unstructured_ingest/v2/pipeline/steps/index.py +1 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +85 -13
- unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -6
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +6 -5
- {unstructured_ingest-0.5.7.dist-info → unstructured_ingest-0.5.9.dist-info}/METADATA +26 -26
- {unstructured_ingest-0.5.7.dist-info → unstructured_ingest-0.5.9.dist-info}/RECORD +15 -15
- {unstructured_ingest-0.5.7.dist-info → unstructured_ingest-0.5.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.7.dist-info → unstructured_ingest-0.5.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.7.dist-info → unstructured_ingest-0.5.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.7.dist-info → unstructured_ingest-0.5.9.dist-info}/top_level.txt +0 -0
|
@@ -56,12 +56,7 @@ def test_precheck_succeeds_indexer(connection_config: AstraDBConnectionConfig):
|
|
|
56
56
|
connection_config=connection_config,
|
|
57
57
|
index_config=AstraDBIndexerConfig(collection_name=EXISTENT_COLLECTION_NAME),
|
|
58
58
|
)
|
|
59
|
-
uploader = AstraDBUploader(
|
|
60
|
-
connection_config=connection_config,
|
|
61
|
-
upload_config=AstraDBUploaderConfig(collection_name=EXISTENT_COLLECTION_NAME),
|
|
62
|
-
)
|
|
63
59
|
indexer.precheck()
|
|
64
|
-
uploader.precheck()
|
|
65
60
|
|
|
66
61
|
|
|
67
62
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
@@ -73,6 +68,12 @@ def test_precheck_succeeds_uploader(connection_config: AstraDBConnectionConfig):
|
|
|
73
68
|
)
|
|
74
69
|
uploader.precheck()
|
|
75
70
|
|
|
71
|
+
uploader2 = AstraDBUploader(
|
|
72
|
+
connection_config=connection_config,
|
|
73
|
+
upload_config=AstraDBUploaderConfig(),
|
|
74
|
+
)
|
|
75
|
+
uploader2.precheck()
|
|
76
|
+
|
|
76
77
|
|
|
77
78
|
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, VECTOR_DB_TAG)
|
|
78
79
|
@requires_env("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT")
|
|
@@ -216,6 +217,32 @@ async def test_astra_search_destination(
|
|
|
216
217
|
)
|
|
217
218
|
|
|
218
219
|
|
|
220
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
221
|
+
@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
|
|
222
|
+
def test_astra_create_destination():
|
|
223
|
+
env_data = get_env_data()
|
|
224
|
+
connection_config = AstraDBConnectionConfig(
|
|
225
|
+
access_config=AstraDBAccessConfig(api_endpoint=env_data.api_endpoint, token=env_data.token),
|
|
226
|
+
)
|
|
227
|
+
uploader = AstraDBUploader(
|
|
228
|
+
connection_config=connection_config,
|
|
229
|
+
upload_config=AstraDBUploaderConfig(),
|
|
230
|
+
)
|
|
231
|
+
collection_name = "system_created-123"
|
|
232
|
+
formatted_collection_name = "system_created_123"
|
|
233
|
+
created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
|
|
234
|
+
assert created
|
|
235
|
+
assert uploader.upload_config.collection_name == formatted_collection_name
|
|
236
|
+
|
|
237
|
+
created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
|
|
238
|
+
assert not created
|
|
239
|
+
|
|
240
|
+
# cleanup
|
|
241
|
+
client = AstraDBClient()
|
|
242
|
+
db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
|
|
243
|
+
db.drop_collection(formatted_collection_name)
|
|
244
|
+
|
|
245
|
+
|
|
219
246
|
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
220
247
|
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
|
|
221
248
|
def test_astra_stager(
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.5.7" # pragma: no cover
|
|
1
|
+
__version__ = "0.5.9" # pragma: no cover
|
|
@@ -63,14 +63,14 @@ class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationC
|
|
|
63
63
|
@property
|
|
64
64
|
def pinecone_index(self):
|
|
65
65
|
if self._index is None:
|
|
66
|
-
self._index = self.create_index()
|
|
66
|
+
self._index = self.get_index()
|
|
67
67
|
return self._index
|
|
68
68
|
|
|
69
69
|
def initialize(self):
|
|
70
70
|
pass
|
|
71
71
|
|
|
72
72
|
@requires_dependencies(["pinecone"], extras="pinecone")
|
|
73
|
-
def create_index(self) -> "PineconeIndex":
|
|
73
|
+
def get_index(self) -> "PineconeIndex":
|
|
74
74
|
from pinecone import Pinecone
|
|
75
75
|
from unstructured import __version__ as unstructured_version
|
|
76
76
|
|
|
@@ -83,6 +83,16 @@ class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationC
|
|
|
83
83
|
logger.debug(f"connected to index: {pc.describe_index(self.connector_config.index_name)}")
|
|
84
84
|
return index
|
|
85
85
|
|
|
86
|
+
@requires_dependencies(["pinecone"], extras="pinecone")
|
|
87
|
+
def create_index(self) -> "PineconeIndex":
|
|
88
|
+
logger.warning(
|
|
89
|
+
"create_index (a misleading name as of now) will be deprecated soon. "
|
|
90
|
+
+ "Use get_index instead. This is due to unstructured supporting actual "
|
|
91
|
+
+ "index creation/provisioning now. "
|
|
92
|
+
+ "(Support for v2 connectors only. you are currently using a v1 connector.)"
|
|
93
|
+
)
|
|
94
|
+
return self.get_index()
|
|
95
|
+
|
|
86
96
|
@DestinationConnectionError.wrap
|
|
87
97
|
def check_connection(self):
|
|
88
98
|
_ = self.pinecone_index
|
|
@@ -38,7 +38,9 @@ class Uploader(BaseProcess, BaseConnector, ABC):
|
|
|
38
38
|
def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
|
39
39
|
raise NotImplementedError()
|
|
40
40
|
|
|
41
|
-
def create_destination(
|
|
41
|
+
def create_destination(
|
|
42
|
+
self, destination_name: str = "unstructuredautocreated", **kwargs: Any
|
|
43
|
+
) -> bool:
|
|
42
44
|
# Update the uploader config if needed with a new destination that gets created.
|
|
43
45
|
# Return a flag on if anything was created or not.
|
|
44
46
|
return False
|
|
@@ -61,6 +63,6 @@ class Uploader(BaseProcess, BaseConnector, ABC):
|
|
|
61
63
|
@dataclass
|
|
62
64
|
class VectorDBUploader(Uploader, ABC):
|
|
63
65
|
def create_destination(
|
|
64
|
-
self, vector_length: int, destination_name: str = "
|
|
66
|
+
self, vector_length: int, destination_name: str = "unstructuredautocreated", **kwargs: Any
|
|
65
67
|
) -> bool:
|
|
66
68
|
return False
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import hashlib
|
|
3
|
+
import re
|
|
3
4
|
from dataclasses import dataclass, field
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from time import time
|
|
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
|
|
|
48
49
|
from astrapy import AsyncCollection as AstraDBAsyncCollection
|
|
49
50
|
from astrapy import Collection as AstraDBCollection
|
|
50
51
|
from astrapy import DataAPIClient as AstraDBClient
|
|
52
|
+
from astrapy import Database as AstraDB
|
|
51
53
|
|
|
52
54
|
|
|
53
55
|
CONNECTOR_TYPE = "astradb"
|
|
@@ -85,11 +87,10 @@ class AstraDBConnectionConfig(ConnectionConfig):
|
|
|
85
87
|
)
|
|
86
88
|
|
|
87
89
|
|
|
88
|
-
def get_astra_collection(
|
|
90
|
+
def get_astra_db(
|
|
89
91
|
connection_config: AstraDBConnectionConfig,
|
|
90
|
-
collection_name: str,
|
|
91
92
|
keyspace: str,
|
|
92
|
-
) -> "AstraDBCollection":
|
|
93
|
+
) -> "AstraDB":
|
|
93
94
|
# Build the Astra DB object.
|
|
94
95
|
access_configs = connection_config.access_config.get_secret_value()
|
|
95
96
|
|
|
@@ -103,9 +104,20 @@ def get_astra_collection(
|
|
|
103
104
|
token=access_configs.token,
|
|
104
105
|
keyspace=keyspace,
|
|
105
106
|
)
|
|
107
|
+
return astra_db
|
|
108
|
+
|
|
106
109
|
|
|
107
|
-
|
|
110
|
+
def get_astra_collection(
|
|
111
|
+
connection_config: AstraDBConnectionConfig,
|
|
112
|
+
collection_name: str,
|
|
113
|
+
keyspace: str,
|
|
114
|
+
) -> "AstraDBCollection":
|
|
115
|
+
|
|
116
|
+
astra_db = get_astra_db(connection_config=connection_config, keyspace=keyspace)
|
|
117
|
+
|
|
118
|
+
# astradb will return a collection object in all cases (even if it doesn't exist)
|
|
108
119
|
astra_db_collection = astra_db.get_collection(name=collection_name)
|
|
120
|
+
|
|
109
121
|
return astra_db_collection
|
|
110
122
|
|
|
111
123
|
|
|
@@ -151,10 +163,11 @@ class AstraDBDownloaderConfig(DownloaderConfig):
|
|
|
151
163
|
|
|
152
164
|
|
|
153
165
|
class AstraDBUploaderConfig(UploaderConfig):
|
|
154
|
-
collection_name: str = Field(
|
|
166
|
+
collection_name: Optional[str] = Field(
|
|
155
167
|
description="The name of the Astra DB collection. "
|
|
156
168
|
"Note that the collection name must only include letters, "
|
|
157
|
-
"numbers, and underscores."
|
|
169
|
+
"numbers, and underscores.",
|
|
170
|
+
default=None,
|
|
158
171
|
)
|
|
159
172
|
keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
|
|
160
173
|
requested_indexing_policy: Optional[dict[str, Any]] = Field(
|
|
@@ -337,25 +350,84 @@ class AstraDBUploader(Uploader):
|
|
|
337
350
|
upload_config: AstraDBUploaderConfig
|
|
338
351
|
connector_type: str = CONNECTOR_TYPE
|
|
339
352
|
|
|
353
|
+
def init(self, **kwargs: Any) -> None:
|
|
354
|
+
self.create_destination(**kwargs)
|
|
355
|
+
|
|
340
356
|
def precheck(self) -> None:
|
|
341
357
|
try:
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
358
|
+
if self.upload_config.collection_name:
|
|
359
|
+
self.get_collection(collection_name=self.upload_config.collection_name).options()
|
|
360
|
+
else:
|
|
361
|
+
# check for db connection only if collection name is not provided
|
|
362
|
+
get_astra_db(
|
|
363
|
+
connection_config=self.connection_config,
|
|
364
|
+
keyspace=self.upload_config.keyspace,
|
|
365
|
+
)
|
|
347
366
|
except Exception as e:
|
|
348
367
|
logger.error(f"Failed to validate connection {e}", exc_info=True)
|
|
349
368
|
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
350
369
|
|
|
351
370
|
@requires_dependencies(["astrapy"], extras="astradb")
|
|
352
|
-
def get_collection(self) -> "AstraDBCollection":
|
|
371
|
+
def get_collection(self, collection_name: Optional[str] = None) -> "AstraDBCollection":
|
|
353
372
|
return get_astra_collection(
|
|
354
373
|
connection_config=self.connection_config,
|
|
355
|
-
collection_name=self.upload_config.collection_name,
|
|
374
|
+
collection_name=collection_name or self.upload_config.collection_name,
|
|
356
375
|
keyspace=self.upload_config.keyspace,
|
|
357
376
|
)
|
|
358
377
|
|
|
378
|
+
def _collection_exists(self, collection_name: str):
|
|
379
|
+
from astrapy.exceptions import CollectionNotFoundException
|
|
380
|
+
|
|
381
|
+
collection = get_astra_collection(
|
|
382
|
+
connection_config=self.connection_config,
|
|
383
|
+
collection_name=collection_name,
|
|
384
|
+
keyspace=self.upload_config.keyspace,
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
try:
|
|
388
|
+
collection.options()
|
|
389
|
+
return True
|
|
390
|
+
except CollectionNotFoundException:
|
|
391
|
+
return False
|
|
392
|
+
except Exception as e:
|
|
393
|
+
logger.error(f"failed to check if astra collection exists : {e}")
|
|
394
|
+
raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
|
|
395
|
+
|
|
396
|
+
def format_destination_name(self, destination_name: str) -> str:
|
|
397
|
+
# AstraDB collection naming requirements:
|
|
398
|
+
# must be below 50 characters
|
|
399
|
+
# must be lowercase alphanumeric and underscores only
|
|
400
|
+
formatted = re.sub(r"[^a-z0-9]", "_", destination_name.lower())
|
|
401
|
+
return formatted
|
|
402
|
+
|
|
403
|
+
def create_destination(
|
|
404
|
+
self,
|
|
405
|
+
vector_length: int,
|
|
406
|
+
destination_name: str = "unstructuredautocreated",
|
|
407
|
+
similarity_metric: Optional[str] = "cosine",
|
|
408
|
+
**kwargs: Any,
|
|
409
|
+
) -> bool:
|
|
410
|
+
destination_name = self.format_destination_name(destination_name)
|
|
411
|
+
collection_name = self.upload_config.collection_name or destination_name
|
|
412
|
+
self.upload_config.collection_name = collection_name
|
|
413
|
+
|
|
414
|
+
if not self._collection_exists(collection_name):
|
|
415
|
+
astra_db = get_astra_db(
|
|
416
|
+
connection_config=self.connection_config, keyspace=self.upload_config.keyspace
|
|
417
|
+
)
|
|
418
|
+
logger.info(
|
|
419
|
+
f"creating default astra collection '{collection_name}' with dimension "
|
|
420
|
+
f"{vector_length} and metric {similarity_metric}"
|
|
421
|
+
)
|
|
422
|
+
astra_db.create_collection(
|
|
423
|
+
collection_name,
|
|
424
|
+
dimension=vector_length,
|
|
425
|
+
metric=similarity_metric,
|
|
426
|
+
)
|
|
427
|
+
return True
|
|
428
|
+
logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
|
|
429
|
+
return False
|
|
430
|
+
|
|
359
431
|
def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
|
|
360
432
|
logger.debug(
|
|
361
433
|
f"deleting records from collection {collection.name} "
|
|
@@ -233,9 +233,9 @@ class ConfluenceDownloader(Downloader):
|
|
|
233
233
|
raise ValueError(f"Page with ID {doc_id} does not exist.")
|
|
234
234
|
|
|
235
235
|
content = page["body"]["view"]["value"]
|
|
236
|
-
# This supports v2 html parsing in unstructured
|
|
237
236
|
title = page["title"]
|
|
238
|
-
|
|
237
|
+
# Using h1 for title is supported by both v1 and v2 html parsing in unstructured
|
|
238
|
+
title_html = f"<h1>{title}</h1>"
|
|
239
239
|
content = f"<body class='Document' >{title_html}{content}</body>"
|
|
240
240
|
if self.download_config.extract_images:
|
|
241
241
|
with self.connection_config.get_client() as client:
|
|
@@ -208,7 +208,7 @@ class PineconeUploader(VectorDBUploader):
|
|
|
208
208
|
def create_destination(
|
|
209
209
|
self,
|
|
210
210
|
vector_length: int,
|
|
211
|
-
destination_name: str = "
|
|
211
|
+
destination_name: str = "unstructuredautocreated",
|
|
212
212
|
destination_type: Literal["pod", "serverless"] = "serverless",
|
|
213
213
|
serverless_cloud: str = "aws",
|
|
214
214
|
serverless_region: str = "us-west-2",
|
|
@@ -219,7 +219,7 @@ class PineconeUploader(VectorDBUploader):
|
|
|
219
219
|
) -> bool:
|
|
220
220
|
from pinecone import PodSpec, ServerlessSpec
|
|
221
221
|
|
|
222
|
-
index_name =
|
|
222
|
+
index_name = self.connection_config.index_name or destination_name
|
|
223
223
|
index_name = self.format_destination_name(index_name)
|
|
224
224
|
self.connection_config.index_name = index_name
|
|
225
225
|
|
|
@@ -228,13 +228,11 @@ class PineconeUploader(VectorDBUploader):
|
|
|
228
228
|
logger.info(f"creating pinecone index {index_name}")
|
|
229
229
|
|
|
230
230
|
pc = self.connection_config.get_client()
|
|
231
|
-
|
|
232
231
|
if destination_type == "serverless":
|
|
233
232
|
pc.create_index(
|
|
234
|
-
name=
|
|
233
|
+
name=index_name,
|
|
235
234
|
dimension=vector_length,
|
|
236
235
|
spec=ServerlessSpec(cloud=serverless_cloud, region=serverless_region),
|
|
237
|
-
**kwargs,
|
|
238
236
|
)
|
|
239
237
|
|
|
240
238
|
return True
|
|
@@ -244,7 +242,6 @@ class PineconeUploader(VectorDBUploader):
|
|
|
244
242
|
name=destination_name,
|
|
245
243
|
dimension=vector_length,
|
|
246
244
|
spec=PodSpec(environment=pod_environment, pod_type=pod_type, pods=pod_count),
|
|
247
|
-
**kwargs,
|
|
248
245
|
)
|
|
249
246
|
|
|
250
247
|
return True
|
|
@@ -241,10 +241,13 @@ class WeaviateUploader(VectorDBUploader, ABC):
|
|
|
241
241
|
return formatted.capitalize()
|
|
242
242
|
|
|
243
243
|
def create_destination(
|
|
244
|
-
self,
|
|
244
|
+
self,
|
|
245
|
+
destination_name: str = "unstructuredautocreated",
|
|
246
|
+
vector_length: Optional[int] = None,
|
|
247
|
+
**kwargs: Any,
|
|
245
248
|
) -> bool:
|
|
246
|
-
destination_name = self.format_destination_name(destination_name)
|
|
247
249
|
collection_name = self.upload_config.collection or destination_name
|
|
250
|
+
collection_name = self.format_destination_name(collection_name)
|
|
248
251
|
self.upload_config.collection = collection_name
|
|
249
252
|
|
|
250
253
|
connectors_dir = Path(__file__).parents[1]
|
|
@@ -254,9 +257,7 @@ class WeaviateUploader(VectorDBUploader, ABC):
|
|
|
254
257
|
collection_config["class"] = collection_name
|
|
255
258
|
|
|
256
259
|
if not self._collection_exists():
|
|
257
|
-
logger.info(
|
|
258
|
-
f"creating default weaviate collection '{collection_name}' with default configs"
|
|
259
|
-
)
|
|
260
|
+
logger.info(f"creating weaviate collection '{collection_name}' with default configs")
|
|
260
261
|
with self.connection_config.get_client() as weaviate_client:
|
|
261
262
|
weaviate_client.collections.create_from_dict(config=collection_config)
|
|
262
263
|
return True
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.5.7
|
|
3
|
+
Version: 0.5.9
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: pandas
|
|
26
26
|
Requires-Dist: pydantic>=2.7
|
|
27
|
+
Requires-Dist: click
|
|
27
28
|
Requires-Dist: dataclasses_json
|
|
28
|
-
Requires-Dist: python-dateutil
|
|
29
29
|
Requires-Dist: tqdm
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist:
|
|
30
|
+
Requires-Dist: opentelemetry-sdk
|
|
31
|
+
Requires-Dist: python-dateutil
|
|
32
32
|
Provides-Extra: remote
|
|
33
33
|
Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
|
|
34
34
|
Provides-Extra: csv
|
|
@@ -71,46 +71,46 @@ Requires-Dist: fsspec; extra == "azure"
|
|
|
71
71
|
Provides-Extra: azure-ai-search
|
|
72
72
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
73
73
|
Provides-Extra: biomed
|
|
74
|
-
Requires-Dist: bs4; extra == "biomed"
|
|
75
74
|
Requires-Dist: requests; extra == "biomed"
|
|
75
|
+
Requires-Dist: bs4; extra == "biomed"
|
|
76
76
|
Provides-Extra: box
|
|
77
|
-
Requires-Dist: boxfs; extra == "box"
|
|
78
77
|
Requires-Dist: fsspec; extra == "box"
|
|
78
|
+
Requires-Dist: boxfs; extra == "box"
|
|
79
79
|
Provides-Extra: chroma
|
|
80
80
|
Requires-Dist: chromadb; extra == "chroma"
|
|
81
81
|
Provides-Extra: clarifai
|
|
82
82
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
83
83
|
Provides-Extra: confluence
|
|
84
|
-
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
85
84
|
Requires-Dist: requests; extra == "confluence"
|
|
85
|
+
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
86
86
|
Provides-Extra: couchbase
|
|
87
87
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
88
88
|
Provides-Extra: delta-table
|
|
89
|
-
Requires-Dist: deltalake; extra == "delta-table"
|
|
90
89
|
Requires-Dist: boto3; extra == "delta-table"
|
|
90
|
+
Requires-Dist: deltalake; extra == "delta-table"
|
|
91
91
|
Provides-Extra: discord
|
|
92
92
|
Requires-Dist: discord.py; extra == "discord"
|
|
93
93
|
Provides-Extra: dropbox
|
|
94
|
-
Requires-Dist: fsspec; extra == "dropbox"
|
|
95
94
|
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
95
|
+
Requires-Dist: fsspec; extra == "dropbox"
|
|
96
96
|
Provides-Extra: duckdb
|
|
97
97
|
Requires-Dist: duckdb; extra == "duckdb"
|
|
98
98
|
Provides-Extra: elasticsearch
|
|
99
99
|
Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
100
100
|
Provides-Extra: gcs
|
|
101
|
-
Requires-Dist: bs4; extra == "gcs"
|
|
102
|
-
Requires-Dist: fsspec; extra == "gcs"
|
|
103
101
|
Requires-Dist: gcsfs; extra == "gcs"
|
|
102
|
+
Requires-Dist: fsspec; extra == "gcs"
|
|
103
|
+
Requires-Dist: bs4; extra == "gcs"
|
|
104
104
|
Provides-Extra: github
|
|
105
|
-
Requires-Dist: requests; extra == "github"
|
|
106
105
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
106
|
+
Requires-Dist: requests; extra == "github"
|
|
107
107
|
Provides-Extra: gitlab
|
|
108
108
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
109
109
|
Provides-Extra: google-drive
|
|
110
110
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
111
111
|
Provides-Extra: hubspot
|
|
112
|
-
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
113
112
|
Requires-Dist: urllib3; extra == "hubspot"
|
|
113
|
+
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
114
114
|
Provides-Extra: jira
|
|
115
115
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
116
116
|
Provides-Extra: kafka
|
|
@@ -124,23 +124,23 @@ Requires-Dist: pymilvus; extra == "milvus"
|
|
|
124
124
|
Provides-Extra: mongodb
|
|
125
125
|
Requires-Dist: pymongo; extra == "mongodb"
|
|
126
126
|
Provides-Extra: neo4j
|
|
127
|
-
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
128
127
|
Requires-Dist: cymple; extra == "neo4j"
|
|
128
|
+
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
129
129
|
Requires-Dist: networkx; extra == "neo4j"
|
|
130
130
|
Provides-Extra: notion
|
|
131
131
|
Requires-Dist: notion-client; extra == "notion"
|
|
132
|
-
Requires-Dist: backoff; extra == "notion"
|
|
133
132
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
133
|
+
Requires-Dist: backoff; extra == "notion"
|
|
134
134
|
Requires-Dist: httpx; extra == "notion"
|
|
135
135
|
Provides-Extra: onedrive
|
|
136
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
137
136
|
Requires-Dist: msal; extra == "onedrive"
|
|
138
137
|
Requires-Dist: bs4; extra == "onedrive"
|
|
138
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
139
139
|
Provides-Extra: opensearch
|
|
140
140
|
Requires-Dist: opensearch-py; extra == "opensearch"
|
|
141
141
|
Provides-Extra: outlook
|
|
142
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
143
142
|
Requires-Dist: msal; extra == "outlook"
|
|
143
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
144
144
|
Provides-Extra: pinecone
|
|
145
145
|
Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
|
|
146
146
|
Provides-Extra: postgres
|
|
@@ -155,8 +155,8 @@ Provides-Extra: s3
|
|
|
155
155
|
Requires-Dist: s3fs; extra == "s3"
|
|
156
156
|
Requires-Dist: fsspec; extra == "s3"
|
|
157
157
|
Provides-Extra: sharepoint
|
|
158
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
159
158
|
Requires-Dist: msal; extra == "sharepoint"
|
|
159
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
160
160
|
Provides-Extra: salesforce
|
|
161
161
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
162
162
|
Provides-Extra: sftp
|
|
@@ -165,8 +165,8 @@ Requires-Dist: fsspec; extra == "sftp"
|
|
|
165
165
|
Provides-Extra: slack
|
|
166
166
|
Requires-Dist: slack_sdk[optional]; extra == "slack"
|
|
167
167
|
Provides-Extra: snowflake
|
|
168
|
-
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
169
168
|
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
169
|
+
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
170
170
|
Provides-Extra: wikipedia
|
|
171
171
|
Requires-Dist: wikipedia; extra == "wikipedia"
|
|
172
172
|
Provides-Extra: weaviate
|
|
@@ -178,18 +178,18 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
|
|
|
178
178
|
Provides-Extra: singlestore
|
|
179
179
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
180
180
|
Provides-Extra: vectara
|
|
181
|
-
Requires-Dist: httpx; extra == "vectara"
|
|
182
|
-
Requires-Dist: requests; extra == "vectara"
|
|
183
181
|
Requires-Dist: aiofiles; extra == "vectara"
|
|
182
|
+
Requires-Dist: requests; extra == "vectara"
|
|
183
|
+
Requires-Dist: httpx; extra == "vectara"
|
|
184
184
|
Provides-Extra: vastdb
|
|
185
|
-
Requires-Dist: pyarrow; extra == "vastdb"
|
|
186
|
-
Requires-Dist: ibis; extra == "vastdb"
|
|
187
185
|
Requires-Dist: vastdb; extra == "vastdb"
|
|
186
|
+
Requires-Dist: ibis; extra == "vastdb"
|
|
187
|
+
Requires-Dist: pyarrow; extra == "vastdb"
|
|
188
188
|
Provides-Extra: embed-huggingface
|
|
189
189
|
Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
190
190
|
Provides-Extra: embed-octoai
|
|
191
|
-
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
192
191
|
Requires-Dist: openai; extra == "embed-octoai"
|
|
192
|
+
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
193
193
|
Provides-Extra: embed-vertexai
|
|
194
194
|
Requires-Dist: vertexai; extra == "embed-vertexai"
|
|
195
195
|
Provides-Extra: embed-voyageai
|
|
@@ -197,8 +197,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
197
197
|
Provides-Extra: embed-mixedbreadai
|
|
198
198
|
Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
199
199
|
Provides-Extra: openai
|
|
200
|
-
Requires-Dist: tiktoken; extra == "openai"
|
|
201
200
|
Requires-Dist: openai; extra == "openai"
|
|
201
|
+
Requires-Dist: tiktoken; extra == "openai"
|
|
202
202
|
Provides-Extra: bedrock
|
|
203
203
|
Requires-Dist: boto3; extra == "bedrock"
|
|
204
204
|
Requires-Dist: aioboto3; extra == "bedrock"
|
|
@@ -5,7 +5,7 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
5
5
|
test/integration/chunkers/test_chunkers.py,sha256=USkltQN_mVVCxI0FkJsrS1gnLXlVr-fvsc0tPaK2sWI,1062
|
|
6
6
|
test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
test/integration/connectors/conftest.py,sha256=vYs4WDlCuieAwwErkJxCk4a1lGvr3qpeiAm-YaDznSo,1018
|
|
8
|
-
test/integration/connectors/test_astradb.py,sha256=
|
|
8
|
+
test/integration/connectors/test_astradb.py,sha256=pZmUItFzS91etJONk5HaX8ayarXmFH7RhKmtBxmCClQ,8995
|
|
9
9
|
test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNpJ8ewGPqHSGrx626j8hC_Pw,9695
|
|
10
10
|
test/integration/connectors/test_chroma.py,sha256=NuQv0PWPM0_LQfdPeUd6IYKqaKKXWmVaHGWjq5aBfOY,3721
|
|
11
11
|
test/integration/connectors/test_confluence.py,sha256=Ju0gRQbD2g9l9iRf2HDZKi7RyPnBGtFRWcGpsqhO3F8,3588
|
|
@@ -107,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
107
107
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
108
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
109
109
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
110
|
-
unstructured_ingest/__version__.py,sha256=
|
|
110
|
+
unstructured_ingest/__version__.py,sha256=Shgafr3Iliv3VjkCZFY-nW2PV7lNrzP2f2kMUaHsecA,42
|
|
111
111
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
112
112
|
unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
|
|
113
113
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -189,7 +189,7 @@ unstructured_ingest/connector/mongodb.py,sha256=UD8T1V435YvGY68dpL-fyFesD7bcLckp
|
|
|
189
189
|
unstructured_ingest/connector/onedrive.py,sha256=-yy3scFHVIUiPAAQdmJXel3_BMZnZc9qUI8HwecuoJ4,8911
|
|
190
190
|
unstructured_ingest/connector/opensearch.py,sha256=kvzqEqanP6nGHjxCJ2e2CAz9iK8na3yYBX1l4ZuVq0A,7937
|
|
191
191
|
unstructured_ingest/connector/outlook.py,sha256=f7WXb1xhf4iA3B7HTOCz2KuqxrywuChoDsDSy-erwYY,10443
|
|
192
|
-
unstructured_ingest/connector/pinecone.py,sha256=
|
|
192
|
+
unstructured_ingest/connector/pinecone.py,sha256=wS5hkKPDt2hqbqGEcg0s1T_iYJsxGPtDV8f_S-4YCkw,5273
|
|
193
193
|
unstructured_ingest/connector/qdrant.py,sha256=Y1PAW6ueAzkTxoeViZ7JjkErFJNJlSYvzaRU1c-hcJA,4964
|
|
194
194
|
unstructured_ingest/connector/reddit.py,sha256=8pyVSXXKGS9vOlNBeXw1ev5oqu-uWka5hzgUI8CFRos,5457
|
|
195
195
|
unstructured_ingest/connector/registry.py,sha256=SxXKzOGimHGYOPDSCsYm_xhbwNb-DIcv6XqxoPRIaIY,4846
|
|
@@ -400,7 +400,7 @@ unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGS
|
|
|
400
400
|
unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
|
|
401
401
|
unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
|
|
402
402
|
unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
|
|
403
|
-
unstructured_ingest/v2/interfaces/uploader.py,sha256=
|
|
403
|
+
unstructured_ingest/v2/interfaces/uploader.py,sha256=AMgp0uaJ5XeqiyURLIUnWyoIqhUT9Ak5P_LT9-qasYk,2107
|
|
404
404
|
unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
405
405
|
unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
|
|
406
406
|
unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
|
|
@@ -410,7 +410,7 @@ unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53o
|
|
|
410
410
|
unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
|
|
411
411
|
unstructured_ingest/v2/pipeline/steps/embed.py,sha256=iL6X0G5AvKnlfI-3XRWudlb0-6rD_PqyzA3MFmmcn6M,3199
|
|
412
412
|
unstructured_ingest/v2/pipeline/steps/filter.py,sha256=pju7knTSbB2ll1jC9DPePRDnHlOlvEcU1-sjk6xYGGc,1211
|
|
413
|
-
unstructured_ingest/v2/pipeline/steps/index.py,sha256=
|
|
413
|
+
unstructured_ingest/v2/pipeline/steps/index.py,sha256=m0BbUwe_7s_gFxR9K31IJdAf3_GgKXXajGJec5jcSXA,3557
|
|
414
414
|
unstructured_ingest/v2/pipeline/steps/partition.py,sha256=IJQWaOTcyFlH2bz8WbmynE5Zkd5D8ELOKTnSCnt9Wcc,3282
|
|
415
415
|
unstructured_ingest/v2/pipeline/steps/stage.py,sha256=VR8SLUJdVva61aieVKyxUHzupTCQbQeaMA0CKu4Fx7o,2347
|
|
416
416
|
unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=p2nPFGbcpivPAZO5jDogTfn0iaL5bCFsgBNMejxVbzE,1768
|
|
@@ -424,10 +424,10 @@ unstructured_ingest/v2/processes/partitioner.py,sha256=ZC9mt85I3o_SLR4DvE7vPBGph
|
|
|
424
424
|
unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
|
|
425
425
|
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=KO1zn-96Qa49TOSZn-gv_RUMGMCmUcdtHoeJqCpxPLY,6219
|
|
426
426
|
unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
|
|
427
|
-
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=
|
|
427
|
+
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=3WFJUNEjeuZFhsLW9KzOIOsiStCjpnqKokS1oIQLUR0,17816
|
|
428
428
|
unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
|
|
429
429
|
unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
|
|
430
|
-
unstructured_ingest/v2/processes/connectors/confluence.py,sha256=
|
|
430
|
+
unstructured_ingest/v2/processes/connectors/confluence.py,sha256=uLpbOtTwbl9TmkWVKbAhH-1UOQvYuCN-v1PIA3BFndc,11139
|
|
431
431
|
unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
|
|
432
432
|
unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
|
|
433
433
|
unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
|
|
@@ -440,7 +440,7 @@ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNN
|
|
|
440
440
|
unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=ijp5hjmDpLoIHL9UJzV4_4vVtQBlQ2R_vLatlUYivX4,17464
|
|
441
441
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
|
|
442
442
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
443
|
-
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
|
|
443
|
+
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
|
|
444
444
|
unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
|
|
445
445
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
446
446
|
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
|
|
@@ -566,10 +566,10 @@ unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWa
|
|
|
566
566
|
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
|
|
567
567
|
unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
|
|
568
568
|
unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
|
|
569
|
-
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=
|
|
570
|
-
unstructured_ingest-0.5.
|
|
571
|
-
unstructured_ingest-0.5.
|
|
572
|
-
unstructured_ingest-0.5.
|
|
573
|
-
unstructured_ingest-0.5.
|
|
574
|
-
unstructured_ingest-0.5.
|
|
575
|
-
unstructured_ingest-0.5.
|
|
569
|
+
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=UZ_s8dnVNx9BWFG2fPah4VbQbgEDF4nP78bQeU3jg08,12821
|
|
570
|
+
unstructured_ingest-0.5.9.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
571
|
+
unstructured_ingest-0.5.9.dist-info/METADATA,sha256=MvJkJj8xsL18KTeSJbMCEyDOXQ9aJ1xh9WYAnnLxizM,8316
|
|
572
|
+
unstructured_ingest-0.5.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
573
|
+
unstructured_ingest-0.5.9.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
574
|
+
unstructured_ingest-0.5.9.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
575
|
+
unstructured_ingest-0.5.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.5.7.dist-info → unstructured_ingest-0.5.9.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|