unstructured-ingest 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_lancedb.py +9 -8
- test/integration/connectors/test_milvus.py +34 -6
- test/integration/connectors/test_mongodb.py +332 -0
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/unit/test_utils.py +21 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/string_and_date_utils.py +10 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +16 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -4
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +7 -7
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +9 -3
- unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +4 -3
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +10 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/METADATA +14 -12
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/RECORD +24 -21
- /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/lancedb/cloud.py (new file)

@@ -0,0 +1,42 @@
+from dataclasses import dataclass
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces.connector import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
+    LanceDBRemoteConnectionConfig,
+    LanceDBUploader,
+    LanceDBUploaderConfig,
+    LanceDBUploadStager,
+    LanceDBUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "lancedb_cloud"
+
+
+class LanceDBCloudAccessConfig(AccessConfig):
+    api_key: str = Field(description="Api key associated with LanceDb cloud")
+
+
+class LanceDBCloudConnectionConfig(LanceDBRemoteConnectionConfig):
+    access_config: Secret[LanceDBCloudAccessConfig]
+
+    def get_storage_options(self) -> dict:
+        return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
+
+
+@dataclass
+class LanceDBCloudUploader(LanceDBUploader):
+    upload_config: LanceDBUploaderConfig
+    connection_config: LanceDBCloudConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+lancedb_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=LanceDBCloudConnectionConfig,
+    uploader=LanceDBCloudUploader,
+    uploader_config=LanceDBUploaderConfig,
+    upload_stager_config=LanceDBUploadStagerConfig,
+    upload_stager=LanceDBUploadStager,
+)
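The new get_storage_options() hook unwraps the secret access config and merges its fields with the timeout inherited from LanceDBRemoteConnectionConfig. A minimal standalone sketch of that pydantic Secret pattern (the class names and timeout value here are illustrative, not part of the package):

from pydantic import BaseModel, Field, Secret

class ExampleAccessConfig(BaseModel):
    api_key: str = Field(description="API key")

class ExampleConnectionConfig(BaseModel):
    access_config: Secret[ExampleAccessConfig]  # hidden from repr and logs
    timeout: int = 30  # stand-in for the inherited timeout field

    def get_storage_options(self) -> dict:
        # Unwrap the secret and merge its fields with the non-secret timeout.
        return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}

cfg = ExampleConnectionConfig(access_config=ExampleAccessConfig(api_key="example-key"))
print(cfg.get_storage_options())  # {'api_key': 'example-key', 'timeout': 30}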
unstructured_ingest/v2/processes/connectors/milvus.py

@@ -8,7 +8,7 @@ import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import WriteError
+from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
@@ -66,7 +66,6 @@ class MilvusConnectionConfig(ConnectionConfig):
 
 
 class MilvusUploadStagerConfig(UploadStagerConfig):
-
     fields_to_include: Optional[list[str]] = None
     """If set - list of fields to include in the output.
     Unspecified fields are removed from the elements.
@@ -174,6 +173,14 @@ class MilvusUploader(Uploader):
     upload_config: MilvusUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @DestinationConnectionError.wrap
+    def precheck(self):
+        with self.get_client() as client:
+            if not client.has_collection(self.upload_config.collection_name):
+                raise DestinationConnectionError(
+                    f"Collection '{self.upload_config.collection_name}' does not exist"
+                )
+
     @contextmanager
     def get_client(self) -> Generator["MilvusClient", None, None]:
        client = self.connection_config.get_client()
@@ -218,7 +225,6 @@
                 f"db in collection {self.upload_config.collection_name}"
             )
         with self.get_client() as client:
-
             try:
                 res = client.insert(collection_name=self.upload_config.collection_name, data=data)
             except MilvusException as milvus_exception:
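The new Milvus precheck verifies the target collection exists before any write is attempted. A rough standalone sketch of the same check (assumes pymilvus is installed and a server is reachable; the uri and collection name are placeholders):

from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530")  # placeholder endpoint
if not client.has_collection("example_collection"):
    # Mirrors the connector's precheck, which raises DestinationConnectionError
    raise RuntimeError("Collection 'example_collection' does not exist")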
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -1,6 +1,7 @@
 import json
 import sys
-from dataclasses import dataclass, field
+from contextlib import contextmanager
+from dataclasses import dataclass, replace
 from datetime import datetime
 from pathlib import Path
 from time import time
@@ -12,6 +13,7 @@ from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -24,8 +26,6 @@ from unstructured_ingest.v2.interfaces import (
     SourceIdentifiers,
     Uploader,
     UploaderConfig,
-    UploadStager,
-    UploadStagerConfig,
     download_responses,
 )
 from unstructured_ingest.v2.logger import logger
@@ -36,6 +36,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 
 if TYPE_CHECKING:
     from pymongo import MongoClient
+    from pymongo.collection import Collection
 
 CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"
@@ -54,18 +55,37 @@ class MongoDBConnectionConfig(ConnectionConfig):
         description="hostname or IP address or Unix domain socket path of a single mongod or "
         "mongos instance to connect to, or a list of hostnames",
     )
-    database: Optional[str] = Field(default=None, description="database name to connect to")
-    collection: Optional[str] = Field(default=None, description="collection name to connect to")
     port: int = Field(default=27017)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @contextmanager
+    @requires_dependencies(["pymongo"], extras="mongodb")
+    def get_client(self) -> Generator["MongoClient", None, None]:
+        from pymongo import MongoClient
+        from pymongo.driver_info import DriverInfo
+        from pymongo.server_api import ServerApi
 
-
-
+        access_config = self.access_config.get_secret_value()
+        if uri := access_config.uri:
+            client_kwargs = {
+                "host": uri,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+                "driver": DriverInfo(name="unstructured", version=unstructured_version),
+            }
+        else:
+            client_kwargs = {
+                "host": self.host,
+                "port": self.port,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+            }
+        with MongoClient(**client_kwargs) as client:
+            yield client
 
 
 class MongoDBIndexerConfig(IndexerConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
 
 
 class MongoDBDownloaderConfig(DownloaderConfig):
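MongoDBConnectionConfig now owns client construction as a context manager, so every caller shares the same URI-versus-host/port logic and gets a guaranteed close. A minimal sketch of the pattern outside the connector (pymongo only; host and port are placeholders):

from contextlib import contextmanager
from typing import Generator

from pymongo import MongoClient
from pymongo.server_api import ServerApi

@contextmanager
def get_client(host: str = "localhost", port: int = 27017) -> Generator[MongoClient, None, None]:
    # MongoClient is itself a context manager: leaving the block closes the
    # connection, which callers of the old create_client() had to do manually.
    with MongoClient(host=host, port=port, server_api=ServerApi("1")) as client:
        yield client

# Usage mirrors the new precheck/run code paths:
# with get_client() as client:
#     client.admin.command("ping")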
@@ -81,42 +101,38 @@ class MongoDBIndexer(Indexer):
     def precheck(self) -> None:
         """Validates the connection to the MongoDB server."""
         try:
-            client = self.create_client()
-            client.admin.command("ping")
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.index_config.database
+                if database_name not in database_names:
+                    raise DestinationConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.index_config.collection
+                if collection_name not in collection_names:
+                    raise SourceConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
         except Exception as e:
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to validate connection: {e}")
 
-    @requires_dependencies(["pymongo"], extras="mongodb")
-    def create_client(self) -> "MongoClient":
-        from pymongo import MongoClient
-        from pymongo.driver_info import DriverInfo
-        from pymongo.server_api import ServerApi
-
-        access_config = self.connection_config.access_config.get_secret_value()
-
-        if access_config.uri:
-            return MongoClient(
-                access_config.uri,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-                driver=DriverInfo(name="unstructured", version=unstructured_version),
-            )
-        else:
-            return MongoClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-            )
-
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         """Generates FileData objects for each document in the MongoDB collection."""
-        client = self.create_client()
-        database = client[self.connection_config.database]
-        collection = database[self.connection_config.collection]
+        with self.connection_config.get_client() as client:
+            database = client[self.index_config.database]
+            collection = database[self.index_config.collection]
 
-        # Get list of document IDs
-        ids = collection.distinct("_id")
-        batch_size = self.index_config.batch_size if self.index_config else 100
+            # Get list of document IDs
+            ids = collection.distinct("_id")
+            batch_size = self.index_config.batch_size if self.index_config else 100
 
         for id_batch in batch_generator(ids, batch_size=batch_size):
             # Make sure the hash is always a positive number to create identifier
@@ -125,8 +141,8 @@ class MongoDBIndexer(Indexer):
             metadata = FileDataSourceMetadata(
                 date_processed=str(time()),
                 record_locator={
-                    "database": self.connection_config.database,
-                    "collection": self.connection_config.collection,
+                    "database": self.index_config.database,
+                    "collection": self.index_config.collection,
                 },
             )
 
@@ -177,8 +193,8 @@ class MongoDBDownloader(Downloader):
         from bson.objectid import ObjectId
 
         client = self.create_client()
-        database = client[self.connection_config.database]
-        collection = database[self.connection_config.collection]
+        database = client[file_data.metadata.record_locator["database"]]
+        collection = database[file_data.metadata.record_locator["collection"]]
 
         ids = file_data.additional_metadata.get("ids", [])
         if not ids:
@@ -222,14 +238,12 @@ class MongoDBDownloader(Downloader):
             concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
 
             # Create a FileData object for each document with source_identifiers
-            individual_file_data = FileData(
-                identifier=str(doc_id),
-                connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(
-                    filename=str(doc_id),
-                    fullpath=str(doc_id),
-                    rel_path=str(doc_id),
-                ),
+            individual_file_data = replace(file_data)
+            individual_file_data.identifier = str(doc_id)
+            individual_file_data.source_identifiers = SourceIdentifiers(
+                filename=str(doc_id),
+                fullpath=str(doc_id),
+                rel_path=str(doc_id),
             )
 
             # Determine the download path
@@ -247,15 +261,8 @@
             individual_file_data.local_download_path = str(download_path)
 
             # Update metadata
-            individual_file_data.metadata = FileDataSourceMetadata(
-                date_created=date_created,
-                date_processed=str(time()),
-                record_locator={
-                    "database": self.connection_config.database,
-                    "collection": self.connection_config.collection,
-                    "document_id": str(doc_id),
-                },
-            )
+            individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
+            individual_file_data.metadata.date_created = date_created
 
             download_response = self.generate_download_response(
                 file_data=individual_file_data, download_path=download_path
@@ -265,31 +272,14 @@
         return download_responses
 
 
-@dataclass
-class MongoDBUploadStager(UploadStager):
-    upload_stager_config: MongoDBUploadStagerConfig = field(
-        default_factory=lambda: MongoDBUploadStagerConfig()
-    )
-
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(elements_contents, output_file)
-        return output_path
-
-
 class MongoDBUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -300,55 +290,76 @@ class MongoDBUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            client = self.create_client()
-            client.admin.command("ping")
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.upload_config.database
+                if database_name not in database_names:
+                    raise DestinationConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.upload_config.collection
+                if collection_name not in collection_names:
+                    raise SourceConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @requires_dependencies(["pymongo"], extras="mongodb")
-    def create_client(self) -> "MongoClient":
-        from pymongo import MongoClient
-        from pymongo.driver_info import DriverInfo
-        from pymongo.server_api import ServerApi
-
-        access_config = self.connection_config.access_config.get_secret_value()
-
-        if access_config.uri:
-            return MongoClient(
-                access_config.uri,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-                driver=DriverInfo(name="unstructured", version=unstructured_version),
-            )
-        else:
-            return MongoClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-            )
+    def can_delete(self, collection: "Collection") -> bool:
+        indexed_keys = []
+        for index in collection.list_indexes():
+            key_bson = index["key"]
+            indexed_keys.extend(key_bson.keys())
+        return self.upload_config.record_id_key in indexed_keys
+
+    def delete_by_record_id(self, collection: "Collection", file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from collection: {collection.name}"
+        )
+        query = {self.upload_config.record_id_key: file_data.identifier}
+        delete_results = collection.delete_many(filter=query)
+        logger.info(
+            f"deleted {delete_results.deleted_count} records from collection {collection.name}"
+        )
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
-            f"db, {self.connection_config.database}, "
-            f"collection {self.connection_config.collection} "
+            f"db, {self.upload_config.database}, "
+            f"collection {self.upload_config.collection} "
             f"at {self.connection_config.host}",
         )
-        client = self.create_client()
-        db = client[self.connection_config.database]
-        collection = db[self.connection_config.collection]
-        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
-            collection.insert_many(chunk)
+        # This would typically live in the stager but since no other manipulation
+        # is done, setting the record id field in the uploader
+        for element in elements_dict:
+            element[self.upload_config.record_id_key] = file_data.identifier
+        with self.connection_config.get_client() as client:
+            db = client[self.upload_config.database]
+            collection = db[self.upload_config.collection]
+            if self.can_delete(collection=collection):
+                self.delete_by_record_id(file_data=file_data, collection=collection)
+            else:
+                logger.warning("criteria for deleting previous content not met, skipping")
+            for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+                collection.insert_many(chunk)
 
 
 mongodb_destination_entry = DestinationRegistryEntry(
     connection_config=MongoDBConnectionConfig,
     uploader=MongoDBUploader,
     uploader_config=MongoDBUploaderConfig,
-    upload_stager=MongoDBUploadStager,
-    upload_stager_config=MongoDBUploadStagerConfig,
 )
 
 mongodb_source_entry = SourceRegistryEntry(
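The rewritten uploader makes re-runs idempotent: it stamps every element with a record id, deletes prior content for that record when the key is indexed, then inserts the new batch. A condensed sketch of that delete-then-insert flow (pymongo; all names and values are placeholders):

from pymongo import MongoClient

RECORD_ID_KEY = "record_id"  # stand-in for RECORD_ID_LABEL

with MongoClient("localhost", 27017) as client:
    collection = client["example_db"]["example_collection"]
    # can_delete(): only delete by record id if that key is indexed.
    indexed_keys = [key for index in collection.list_indexes() for key in index["key"].keys()]
    if RECORD_ID_KEY in indexed_keys:
        collection.delete_many({RECORD_ID_KEY: "file-123"})
    # Stamp and insert the new batch of elements.
    collection.insert_many([{"text": "hello", RECORD_ID_KEY: "file-123"}])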
unstructured_ingest/v2/processes/connectors/weaviate/__init__.py

@@ -10,6 +10,8 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
 from .embedded import weaviate_embedded_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
 from .local import weaviate_local_destination_entry
+from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
+from .weaviate import weaviate_destination_entry
 
 add_destination_entry(
     destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -20,3 +22,4 @@ add_destination_entry(
 add_destination_entry(
     destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
 )
+add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
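Net effect: alongside the existing local, embedded, and cloud variants, the package now registers a generic Weaviate destination, using the weaviate_destination_entry defined at the bottom of weaviate.py (see the last hunk below).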
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py

@@ -55,10 +55,11 @@ class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
             "client_secret": access_config.client_secret is not None,
             "client_password": access_config.password is not None and self.username is not None,
         }
-
+        existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
+
+        if len(existing_auths) == 0:
             raise ValueError("No auth values provided and anonymous is False")
-        if len(
-        existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
+        if len(existing_auths) > 1:
             raise ValueError(
                 "Multiple auth values provided, only one approach can be used: {}".format(
                     ", ".join(existing_auths)
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py

@@ -22,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
     from weaviate.classes.init import Timeout
@@ -287,3 +288,12 @@ class WeaviateUploader(Uploader, ABC):
                 vector=vector,
             )
         self.check_for_errors(client=weaviate_client)
+
+
+weaviate_destination_entry = DestinationRegistryEntry(
+    connection_config=WeaviateConnectionConfig,
+    uploader=WeaviateUploader,
+    uploader_config=WeaviateUploaderConfig,
+    upload_stager=WeaviateUploadStager,
+    upload_stager_config=WeaviateUploadStagerConfig,
+)
{unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.1
+Version: 0.3.3
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,27 +22,27 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: python-dateutil
-Requires-Dist: tqdm
-Requires-Dist: click
 Requires-Dist: dataclasses-json
 Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
+Requires-Dist: tqdm
+Requires-Dist: python-dateutil
+Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
@@ -91,21 +91,23 @@ Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
 Provides-Extra: kdbai
 Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
+Provides-Extra: lancedb
+Requires-Dist: lancedb; extra == "lancedb"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus
@@ -115,16 +117,16 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
 Requires-Dist: msal; extra == "onedrive"
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -156,8 +158,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -171,8 +173,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv