unstructured-ingest 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/test_lancedb.py +2 -1
- test/integration/connectors/test_mongodb.py +332 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +10 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +12 -12
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +14 -13
- /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0

test/integration/connectors/test_lancedb.py (+2 -1):

```diff
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Literal, Union
+from uuid import uuid4

 import lancedb
 import pandas as pd
@@ -150,7 +151,7 @@ def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path)
     elif target == "az":
         base_uri = UPath(AZURE_BUCKET)

-    return str(base_uri / "destination" / "lancedb" / DATABASE_NAME)
+    return str(base_uri / "destination" / "lancedb" / str(uuid4()) / DATABASE_NAME)


 def _get_uploader(
```
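
The only functional change here is the `uuid4()` path segment: each test run now writes under a unique prefix, so repeated or concurrent runs no longer collide on the same LanceDB destination. A minimal sketch of the effect (the bucket and database values are placeholders, not from the test):

```python
from uuid import uuid4

DATABASE_NAME = "test-db"  # placeholder standing in for the test's constant
base_uri = "az://bucket"   # placeholder; the test builds this with UPath

# Two invocations now yield two distinct destination URIs:
uri_a = f"{base_uri}/destination/lancedb/{uuid4()}/{DATABASE_NAME}"
uri_b = f"{base_uri}/destination/lancedb/{uuid4()}/{DATABASE_NAME}"
assert uri_a != uri_b
```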

test/integration/connectors/test_mongodb.py (new file, +332 lines):

```python
import json
import os
import time
import uuid
from contextlib import contextmanager
from pathlib import Path
from typing import Generator

import pytest
from pydantic import BaseModel, SecretStr
from pymongo.collection import Collection
from pymongo.database import Database
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
from test.integration.connectors.utils.validation import (
    ValidationConfigs,
    source_connector_validation,
)
from test.integration.utils import requires_env
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
from unstructured_ingest.v2.processes.connectors.mongodb import (
    CONNECTOR_TYPE,
    MongoDBAccessConfig,
    MongoDBConnectionConfig,
    MongoDBDownloader,
    MongoDBDownloaderConfig,
    MongoDBIndexer,
    MongoDBIndexerConfig,
    MongoDBUploader,
    MongoDBUploaderConfig,
)

SOURCE_COLLECTION = "sample-mongodb-data"


class EnvData(BaseModel):
    uri: SecretStr
    database: str


def get_env_data() -> EnvData:
    uri = os.getenv("MONGODB_URI")
    assert uri
    database = os.getenv("MONGODB_DATABASE")
    assert database
    return EnvData(uri=uri, database=database)


@contextmanager
def get_client() -> Generator[MongoClient, None, None]:
    uri = get_env_data().uri.get_secret_value()
    with MongoClient(uri) as client:
        assert client.admin.command("ping")
        yield client


def wait_for_collection(
    database: Database, collection_name: str, retries: int = 10, interval: int = 1
):
    collections = database.list_collection_names()
    attempts = 0
    while collection_name not in collections and attempts < retries:
        attempts += 1
        print(
            "Waiting for collection {} to be recognized: {}".format(
                collection_name, ", ".join(collections)
            )
        )
        time.sleep(interval)
        collections = database.list_collection_names()
    if collection_name not in collections:
        raise TimeoutError(f"Collection {collection_name} was not recognized")


def get_search_index_status(collection: Collection, index_name: str) -> str:
    search_indexes = collection.list_search_indexes(name=index_name)
    search_index = list(search_indexes)[0]
    return search_index["status"]


def wait_for_search_index(
    collection: Collection, index_name: str, retries: int = 60, interval: int = 1
):
    current_status = get_search_index_status(collection, index_name)
    attempts = 0
    while current_status != "READY" and attempts < retries:
        attempts += 1
        print(f"attempt {attempts}: waiting for search index to be READY: {current_status}")
        time.sleep(interval)
        current_status = get_search_index_status(collection, index_name)

    if current_status != "READY":
        raise TimeoutError("search index never detected as READY")


@pytest.fixture
def destination_collection() -> Collection:
    # Creates a uniquely named collection with a knnVector search index,
    # yields it to the test, and drops it afterwards.
    env_data = get_env_data()
    collection_name = f"utic-test-output-{uuid.uuid4()}"
    with get_client() as client:
        database = client[env_data.database]
        print(f"creating collection in database {database}: {collection_name}")
        collection = database.create_collection(name=collection_name)
        search_index_name = "embeddings"
        collection.create_search_index(
            model=SearchIndexModel(
                name=search_index_name,
                definition={
                    "mappings": {
                        "dynamic": True,
                        "fields": {
                            "embeddings": [
                                {"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
                            ]
                        },
                    }
                },
            )
        )
        collection.create_index("record_id")
        wait_for_collection(database=database, collection_name=collection_name)
        wait_for_search_index(collection=collection, index_name=search_index_name)
        try:
            yield collection
        finally:
            print(f"deleting collection: {collection_name}")
            collection.drop()


def validate_collection_count(
    collection: Collection, expected_records: int, retries: int = 10, interval: int = 1
) -> None:
    count = collection.count_documents(filter={})
    attempt = 0
    while count != expected_records and attempt < retries:
        attempt += 1
        print(f"attempt {attempt} to get count of collection {count} to match {expected_records}")
        time.sleep(interval)
        count = collection.count_documents(filter={})
    assert (
        count == expected_records
    ), f"expected count ({expected_records}) does not match how many records were found: {count}"


def validate_collection_vector(
    collection: Collection, embedding: list[float], text: str, retries: int = 30, interval: int = 1
) -> None:
    pipeline = [
        {
            "$vectorSearch": {
                "index": "embeddings",
                "path": "embeddings",
                "queryVector": embedding,
                "numCandidates": 150,
                "limit": 10,
            },
        },
        {"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
    ]
    attempts = 0
    results = list(collection.aggregate(pipeline=pipeline))
    while not results and attempts < retries:
        attempts += 1
        print(f"attempt {attempts}, waiting for valid results: {results}")
        time.sleep(interval)
        results = list(collection.aggregate(pipeline=pipeline))
    if not results:
        raise TimeoutError("Timed out waiting for valid results")
    print(f"found results on attempt {attempts}")
    top_result = results[0]
    assert top_result["score"] == 1.0, "score detected should be 1: {}".format(top_result["score"])
    assert top_result["text"] == text, "text detected should be {}, found: {}".format(
        text, top_result["text"]
    )
    for r in results[1:]:
        assert r["score"] < 1.0, "score detected should be less than 1: {}".format(r["score"])


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
async def test_mongodb_source(temp_dir: Path):
    env_data = get_env_data()
    indexer_config = MongoDBIndexerConfig(database=env_data.database, collection=SOURCE_COLLECTION)
    download_config = MongoDBDownloaderConfig(download_dir=temp_dir)
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
    )
    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
    downloader = MongoDBDownloader(
        connection_config=connection_config, download_config=download_config
    )
    await source_connector_validation(
        indexer=indexer,
        downloader=downloader,
        configs=ValidationConfigs(
            test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
        ),
    )


@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
def test_mongodb_indexer_precheck_fail_no_host():
    indexer_config = MongoDBIndexerConfig(
        database="non-existent-database", collection="non-existent-database"
    )
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
    )
    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
    with pytest.raises(SourceConnectionError):
        indexer.precheck()


@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_indexer_precheck_fail_no_database():
    env_data = get_env_data()
    indexer_config = MongoDBIndexerConfig(
        database="non-existent-database", collection=SOURCE_COLLECTION
    )
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
    )
    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
    with pytest.raises(SourceConnectionError):
        indexer.precheck()


@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_indexer_precheck_fail_no_collection():
    env_data = get_env_data()
    indexer_config = MongoDBIndexerConfig(
        database=env_data.database, collection="non-existent-collection"
    )
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
    )
    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
    with pytest.raises(SourceConnectionError):
        indexer.precheck()


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
async def test_mongodb_destination(
    upload_file: Path,
    destination_collection: Collection,
    tmp_path: Path,
):
    env_data = get_env_data()
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="mongodb_mock_id",
    )
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
    )

    upload_config = MongoDBUploaderConfig(
        database=env_data.database,
        collection=destination_collection.name,
    )
    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
    uploader.precheck()
    uploader.run(path=upload_file, file_data=file_data)

    with upload_file.open() as f:
        staged_elements = json.load(f)
    expected_records = len(staged_elements)
    validate_collection_count(collection=destination_collection, expected_records=expected_records)
    first_element = staged_elements[0]
    validate_collection_vector(
        collection=destination_collection,
        embedding=first_element["embeddings"],
        text=first_element["text"],
    )

    # A second run must replace, not duplicate, the previously written records.
    uploader.run(path=upload_file, file_data=file_data)
    validate_collection_count(collection=destination_collection, expected_records=expected_records)


@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
def test_mongodb_uploader_precheck_fail_no_host():
    upload_config = MongoDBUploaderConfig(
        database="database",
        collection="collection",
    )
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
    )
    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()


@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_uploader_precheck_fail_no_database():
    env_data = get_env_data()
    upload_config = MongoDBUploaderConfig(
        database="database",
        collection="collection",
    )
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
    )
    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()


@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@requires_env("MONGODB_URI", "MONGODB_DATABASE")
def test_mongodb_uploader_precheck_fail_no_collection():
    env_data = get_env_data()
    upload_config = MongoDBUploaderConfig(
        database=env_data.database,
        collection="collection",
    )
    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
    )
    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
    with pytest.raises(DestinationConnectionError):
        uploader.precheck()
```
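
The new suite is gated by `@requires_env("MONGODB_URI", "MONGODB_DATABASE")`, so it only exercises MongoDB when those variables point at a reachable deployment. A hypothetical local invocation, for illustration only (the URI and database values are placeholders):

```python
import os

import pytest

# Placeholders; point these at a real MongoDB Atlas deployment to run the suite.
os.environ.setdefault("MONGODB_URI", "mongodb+srv://user:pass@example.mongodb.net")
os.environ.setdefault("MONGODB_DATABASE", "ingest-test")

pytest.main(["test/integration/connectors/test_mongodb.py", "-v"])
```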

unstructured_ingest/__version__.py (+1 -1):

```diff
@@ -1 +1 @@
-__version__ = "0.3.1"  # pragma: no cover
+__version__ = "0.3.2"  # pragma: no cover
```

unstructured_ingest/v2/processes/connectors/google_drive.py (+1 -1):

```diff
@@ -161,7 +161,7 @@ class GoogleDriveIndexer(Indexer):
                 and isinstance(parent_root_path, str)
             ):
                 fullpath = f"{parent_path}/{filename}"
-                rel_path = fullpath.…
+                rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
                 source_identifiers = SourceIdentifiers(
                     filename=filename, fullpath=fullpath, rel_path=rel_path
                 )
```
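
The rewritten line (the removed version is truncated in the source) computes the relative path with `pathlib`, which both strips the root prefix and normalizes separators. A quick illustration with hypothetical paths:

```python
from pathlib import Path

# Hypothetical Google Drive paths, for illustration only.
parent_root_path = "My Drive"
fullpath = "My Drive/reports/q3/summary.pdf"

rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
print(rel_path)  # reports/q3/summary.pdf
```

Note that `Path.relative_to` raises `ValueError` when `fullpath` does not actually start with `parent_root_path`, so malformed paths fail loudly rather than producing a wrong relative path.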

unstructured_ingest/v2/processes/connectors/mongodb.py (+122 -111; removed lines shown as "…" were elided by the diff viewer):

```diff
@@ -1,6 +1,7 @@
 import json
 import sys
-from …
+from contextlib import contextmanager
+from dataclasses import dataclass, replace
 from datetime import datetime
 from pathlib import Path
 from time import time
@@ -12,6 +13,7 @@ from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -24,8 +26,6 @@ from unstructured_ingest.v2.interfaces import (
     SourceIdentifiers,
     Uploader,
     UploaderConfig,
-    UploadStager,
-    UploadStagerConfig,
     download_responses,
 )
 from unstructured_ingest.v2.logger import logger
@@ -36,6 +36,7 @@ from unstructured_ingest.v2.processes.connector_registry import (

 if TYPE_CHECKING:
     from pymongo import MongoClient
+    from pymongo.collection import Collection

 CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"
@@ -54,18 +55,37 @@ class MongoDBConnectionConfig(ConnectionConfig):
         description="hostname or IP address or Unix domain socket path of a single mongod or "
         "mongos instance to connect to, or a list of hostnames",
     )
-    database: Optional[str] = Field(default=None, description="database name to connect to")
-    collection: Optional[str] = Field(default=None, description="collection name to connect to")
     port: int = Field(default=27017)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+    @contextmanager
+    @requires_dependencies(["pymongo"], extras="mongodb")
+    def get_client(self) -> Generator["MongoClient", None, None]:
+        from pymongo import MongoClient
+        from pymongo.driver_info import DriverInfo
+        from pymongo.server_api import ServerApi

-…
+        access_config = self.access_config.get_secret_value()
+        if uri := access_config.uri:
+            client_kwargs = {
+                "host": uri,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+                "driver": DriverInfo(name="unstructured", version=unstructured_version),
+            }
+        else:
+            client_kwargs = {
+                "host": self.host,
+                "port": self.port,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+            }
+        with MongoClient(**client_kwargs) as client:
+            yield client


 class MongoDBIndexerConfig(IndexerConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")


 class MongoDBDownloaderConfig(DownloaderConfig):
@@ -81,42 +101,38 @@ class MongoDBIndexer(Indexer):
     def precheck(self) -> None:
         """Validates the connection to the MongoDB server."""
         try:
-…
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.index_config.database
+                if database_name not in database_names:
+                    raise DestinationConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.index_config.collection
+                if collection_name not in collection_names:
+                    raise SourceConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
         except Exception as e:
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to validate connection: {e}")

-    @requires_dependencies(["pymongo"], extras="mongodb")
-    def create_client(self) -> "MongoClient":
-        from pymongo import MongoClient
-        from pymongo.driver_info import DriverInfo
-        from pymongo.server_api import ServerApi
-
-        access_config = self.connection_config.access_config.get_secret_value()
-
-        if access_config.uri:
-            return MongoClient(
-                access_config.uri,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-                driver=DriverInfo(name="unstructured", version=unstructured_version),
-            )
-        else:
-            return MongoClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-            )
-
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         """Generates FileData objects for each document in the MongoDB collection."""
-…
+        with self.connection_config.get_client() as client:
+            database = client[self.index_config.database]
+            collection = database[self.index_config.collection]

-…
+            # Get list of document IDs
+            ids = collection.distinct("_id")
+            batch_size = self.index_config.batch_size if self.index_config else 100

         for id_batch in batch_generator(ids, batch_size=batch_size):
             # Make sure the hash is always a positive number to create identifier
@@ -125,8 +141,8 @@ class MongoDBIndexer(Indexer):
             metadata = FileDataSourceMetadata(
                 date_processed=str(time()),
                 record_locator={
-                    "database": self.connection_config.database,
-                    "collection": self.connection_config.collection,
+                    "database": self.index_config.database,
+                    "collection": self.index_config.collection,
                 },
             )
@@ -177,8 +193,8 @@ class MongoDBDownloader(Downloader):
         from bson.objectid import ObjectId

         client = self.create_client()
-        database = client[self.connection_config.database]
-        collection = database[self.connection_config.collection]
+        database = client[file_data.metadata.record_locator["database"]]
+        collection = database[file_data.metadata.record_locator["collection"]]

         ids = file_data.additional_metadata.get("ids", [])
         if not ids:
@@ -222,14 +238,12 @@ class MongoDBDownloader(Downloader):
         concatenated_values = "\n".join(str(value) for value in flattened_dict.values())

         # Create a FileData object for each document with source_identifiers
-        individual_file_data = …
-…
-            rel_path=str(doc_id),
-        ),
+        individual_file_data = replace(file_data)
+        individual_file_data.identifier = str(doc_id)
+        individual_file_data.source_identifiers = SourceIdentifiers(
+            filename=str(doc_id),
+            fullpath=str(doc_id),
+            rel_path=str(doc_id),
         )

         # Determine the download path
@@ -247,15 +261,8 @@ class MongoDBDownloader(Downloader):
         individual_file_data.local_download_path = str(download_path)

         # Update metadata
-        individual_file_data.metadata = …
-…
-            date_processed=str(time()),
-            record_locator={
-                "database": self.connection_config.database,
-                "collection": self.connection_config.collection,
-                "document_id": str(doc_id),
-            },
-        )
+        individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
+        individual_file_data.metadata.date_created = date_created

         download_response = self.generate_download_response(
             file_data=individual_file_data, download_path=download_path
@@ -265,31 +272,14 @@ class MongoDBDownloader(Downloader):
         return download_responses


-@dataclass
-class MongoDBUploadStager(UploadStager):
-    upload_stager_config: MongoDBUploadStagerConfig = field(
-        default_factory=lambda: MongoDBUploadStagerConfig()
-    )
-
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(elements_contents, output_file)
-        return output_path
-
-
 class MongoDBUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )


 @dataclass
@@ -300,55 +290,76 @@ class MongoDBUploader(Uploader):

     def precheck(self) -> None:
         try:
-…
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.upload_config.database
+                if database_name not in database_names:
+                    raise DestinationConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.upload_config.collection
+                if collection_name not in collection_names:
+                    raise SourceConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-…
-        server_api=ServerApi(version=SERVER_API_VERSION),
-    )
+    def can_delete(self, collection: "Collection") -> bool:
+        indexed_keys = []
+        for index in collection.list_indexes():
+            key_bson = index["key"]
+            indexed_keys.extend(key_bson.keys())
+        return self.upload_config.record_id_key in indexed_keys
+
+    def delete_by_record_id(self, collection: "Collection", file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from collection: {collection.name}"
+        )
+        query = {self.upload_config.record_id_key: file_data.identifier}
+        delete_results = collection.delete_many(filter=query)
+        logger.info(
+            f"deleted {delete_results.deleted_count} records from collection {collection.name}"
+        )

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
-            f"db, {self.connection_config.database}, "
-            f"collection {self.connection_config.collection} "
+            f"db, {self.upload_config.database}, "
+            f"collection {self.upload_config.collection} "
             f"at {self.connection_config.host}",
         )
-…
+        # This would typically live in the stager but since no other manipulation
+        # is done, setting the record id field in the uploader
+        for element in elements_dict:
+            element[self.upload_config.record_id_key] = file_data.identifier
+        with self.connection_config.get_client() as client:
+            db = client[self.upload_config.database]
+            collection = db[self.upload_config.collection]
+            if self.can_delete(collection=collection):
+                self.delete_by_record_id(file_data=file_data, collection=collection)
+            else:
+                logger.warning("criteria for deleting previous content not met, skipping")
+            for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+                collection.insert_many(chunk)


 mongodb_destination_entry = DestinationRegistryEntry(
     connection_config=MongoDBConnectionConfig,
     uploader=MongoDBUploader,
     uploader_config=MongoDBUploaderConfig,
-    upload_stager=MongoDBUploadStager,
-    upload_stager_config=MongoDBUploadStagerConfig,
 )

 mongodb_source_entry = SourceRegistryEntry(
```
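
The net effect of this rewrite: connection handling moves onto `MongoDBConnectionConfig.get_client()` as a context manager, the `database`/`collection` fields move from the connection config onto the indexer and uploader configs, the pass-through upload stager is dropped, and the uploader now tags every element with a record id key so a re-run can delete stale entries before inserting. A minimal sketch of the new connection flow, using only names visible in this diff (the URI is a placeholder):

```python
from unstructured_ingest.v2.processes.connectors.mongodb import (
    MongoDBAccessConfig,
    MongoDBConnectionConfig,
)

# Placeholder URI; any reachable MongoDB deployment would do.
connection_config = MongoDBConnectionConfig(
    access_config=MongoDBAccessConfig(uri="mongodb+srv://user:pass@example.mongodb.net"),
)

# get_client() now owns the MongoClient lifecycle, so callers no longer
# construct or close clients themselves (the indexer's old create_client
# method is removed).
with connection_config.get_client() as client:
    client.admin.command("ping")
```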

unstructured_ingest/v2/processes/connectors/weaviate/__init__.py (+3 -0):

```diff
@@ -10,6 +10,8 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
 from .embedded import weaviate_embedded_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
 from .local import weaviate_local_destination_entry
+from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
+from .weaviate import weaviate_destination_entry

 add_destination_entry(
     destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -20,3 +22,4 @@ add_destination_entry(
 add_destination_entry(
     destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
 )
+add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
```

unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py (+10 -0):

```diff
@@ -22,6 +22,7 @@ from unstructured_ingest.v2.interfaces (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

 if TYPE_CHECKING:
     from weaviate.classes.init import Timeout
@@ -287,3 +288,12 @@ class WeaviateUploader(Uploader, ABC):
                 vector=vector,
             )
         self.check_for_errors(client=weaviate_client)
+
+
+weaviate_destination_entry = DestinationRegistryEntry(
+    connection_config=WeaviateConnectionConfig,
+    uploader=WeaviateUploader,
+    uploader_config=WeaviateUploaderConfig,
+    upload_stager=WeaviateUploadStager,
+    upload_stager_config=WeaviateUploadStagerConfig,
+)
```
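
Together with the `__init__.py` change above, this registers a plain "weaviate" destination alongside the existing cloud, local, and embedded variants. A small sketch of what the new entry bundles, using only names visible in this diff:

```python
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
    weaviate_destination_entry,
)

# The entry ties the connection config, uploader, and stager classes together
# so the pipeline can resolve this destination by its CONNECTOR_TYPE.
assert isinstance(weaviate_destination_entry, DestinationRegistryEntry)
print(weaviate_destination_entry.uploader)  # -> WeaviateUploader
```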

{unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA (+12 -12; the changes are the version bump plus reordered Requires-Dist lines):

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.1
+Version: 0.3.2
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: python-dateutil
-Requires-Dist: tqdm
-Requires-Dist: click
-Requires-Dist: dataclasses-json
 Requires-Dist: pydantic>=2.7
-Requires-Dist: pandas
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: pandas
+Requires-Dist: dataclasses-json
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -41,8 +41,8 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -98,8 +98,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -115,19 +115,19 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -156,8 +156,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
```

{unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD (+14 -13; hashes truncated in the source are shown as "…"):

```diff
@@ -6,12 +6,13 @@ test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-Lr
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
 test/integration/connectors/test_astradb.py,sha256=QPFrODXmOHagpuKaiooxXb3OEW93w2g4fmq8BkaBCnY,5303
-test/integration/connectors/test_azure_cog_search.py,sha256=…
+test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworDiaMQoMsxcPDBithb6OFkx4,8876
 test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
 test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
 test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
-test/integration/connectors/test_lancedb.py,sha256=…
+test/integration/connectors/test_lancedb.py,sha256=O3YF6MVBkCsCgklXCJe8Kpy8aKGfafASVH4PspmpcYs,7628
 test/integration/connectors/test_milvus.py,sha256=CVmYw9iEeKT_0OtShxye2E6i1LbWzzDA8JtwJRkYQlA,4763
+test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
 test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
 test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
 test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
@@ -80,7 +81,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=…
+unstructured_ingest/__version__.py,sha256=Js7MXQhyIj1akVjPNsLkmZxqoOHDGOr2opEPgFOSTZQ,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -400,11 +401,11 @@ unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHX
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=yhMDbpkZXs-Kis7tFlgjvNemU-MdWMdpCZDrpZNFaU4,12180
 unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
 unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
-unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=…
+unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
 unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
 unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=Bzv2fa852BcM4_Pr-I_DPvLmjPoXv0Z7BeEA8qSKCDc,9725
-unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=…
+unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
@@ -451,14 +452,14 @@ unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
 unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
-unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=…
+unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=2g1Fm2J0ppfy2jCw4b5YtrsWrSD3VcrAaqiE7FlpIAg,6236
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=…
-unstructured_ingest-0.3.1.dist-info/LICENSE.md,sha256=…
-unstructured_ingest-0.3.1.dist-info/METADATA,sha256=…
-unstructured_ingest-0.3.1.dist-info/WHEEL,sha256=…
-unstructured_ingest-0.3.1.dist-info/entry_points.txt,sha256=…
-unstructured_ingest-0.3.1.dist-info/top_level.txt,sha256=…
-unstructured_ingest-0.3.1.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
+unstructured_ingest-0.3.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.3.2.dist-info/METADATA,sha256=rqTWqewB8eIrgrHJ-8AsNtehy35eSHKseCsveXTwN3Y,7326
+unstructured_ingest-0.3.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.3.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.3.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.3.2.dist-info/RECORD,,
```