unstructured-ingest 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.

--- a/test/integration/connectors/test_lancedb.py
+++ b/test/integration/connectors/test_lancedb.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Literal, Union
+from uuid import uuid4
 
 import lancedb
 import pandas as pd
@@ -150,7 +151,7 @@ def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path)
     elif target == "az":
        base_uri = UPath(AZURE_BUCKET)
 
-    return str(base_uri / "destination" / "lancedb" / DATABASE_NAME)
+    return str(base_uri / "destination" / "lancedb" / str(uuid4()) / DATABASE_NAME)
 
 
 def _get_uploader(
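
Note: the lancedb change above namespaces each run's destination URI under a fresh UUID so repeated or concurrent test runs cannot collide on the same remote path. A minimal sketch of the idea (UPath and DATABASE_NAME come from the test module; the helper name below is illustrative):

    from uuid import uuid4
    from upath import UPath

    DATABASE_NAME = "database"  # placeholder for the test module's constant

    def unique_destination(base_uri: UPath) -> str:
        # Every call yields a distinct path, e.g. .../destination/lancedb/<uuid4>/<db>,
        # so parallel CI runs write to isolated locations.
        return str(base_uri / "destination" / "lancedb" / str(uuid4()) / DATABASE_NAME)
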
--- /dev/null
+++ b/test/integration/connectors/test_mongodb.py
@@ -0,0 +1,332 @@
+import json
+import os
+import time
+import uuid
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+import pytest
+from pydantic import BaseModel, SecretStr
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.mongo_client import MongoClient
+from pymongo.operations import SearchIndexModel
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.mongodb import (
+    CONNECTOR_TYPE,
+    MongoDBAccessConfig,
+    MongoDBConnectionConfig,
+    MongoDBDownloader,
+    MongoDBDownloaderConfig,
+    MongoDBIndexer,
+    MongoDBIndexerConfig,
+    MongoDBUploader,
+    MongoDBUploaderConfig,
+)
+
+SOURCE_COLLECTION = "sample-mongodb-data"
+
+
+class EnvData(BaseModel):
+    uri: SecretStr
+    database: str
+
+
+def get_env_data() -> EnvData:
+    uri = os.getenv("MONGODB_URI")
+    assert uri
+    database = os.getenv("MONGODB_DATABASE")
+    assert database
+    return EnvData(uri=uri, database=database)
+
+
+@contextmanager
+def get_client() -> Generator[MongoClient, None, None]:
+    uri = get_env_data().uri.get_secret_value()
+    with MongoClient(uri) as client:
+        assert client.admin.command("ping")
+        yield client
+
+
+def wait_for_collection(
+    database: Database, collection_name: str, retries: int = 10, interval: int = 1
+):
+    collections = database.list_collection_names()
+    attempts = 0
+    while collection_name not in collections and attempts < retries:
+        attempts += 1
+        print(
+            "Waiting for collection {} to be recognized: {}".format(
+                collection_name, ", ".join(collections)
+            )
+        )
+        time.sleep(interval)
+        collections = database.list_collection_names()
+    if collection_name not in collections:
+        raise TimeoutError(f"Collection {collection_name} was not recognized")
+
+
+def get_search_index_status(collection: Collection, index_name: str) -> str:
+    search_indexes = collection.list_search_indexes(name=index_name)
+    search_index = list(search_indexes)[0]
+    return search_index["status"]
+
+
+def wait_for_search_index(
+    collection: Collection, index_name: str, retries: int = 60, interval: int = 1
+):
+    current_status = get_search_index_status(collection, index_name)
+    attempts = 0
+    while current_status != "READY" and attempts < retries:
+        attempts += 1
+        print(f"attempt {attempts}: waiting for search index to be READY: {current_status}")
+        time.sleep(interval)
+        current_status = get_search_index_status(collection, index_name)
+
+    if current_status != "READY":
+        raise TimeoutError("search index never detected as READY")
+
+
+@pytest.fixture
+def destination_collection() -> Collection:
+    env_data = get_env_data()
+    collection_name = f"utic-test-output-{uuid.uuid4()}"
+    with get_client() as client:
+        database = client[env_data.database]
+        print(f"creating collection in database {database}: {collection_name}")
+        collection = database.create_collection(name=collection_name)
+        search_index_name = "embeddings"
+        collection.create_search_index(
+            model=SearchIndexModel(
+                name=search_index_name,
+                definition={
+                    "mappings": {
+                        "dynamic": True,
+                        "fields": {
+                            "embeddings": [
+                                {"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
+                            ]
+                        },
+                    }
+                },
+            )
+        )
+        collection.create_index("record_id")
+        wait_for_collection(database=database, collection_name=collection_name)
+        wait_for_search_index(collection=collection, index_name=search_index_name)
+        try:
+            yield collection
+        finally:
+            print(f"deleting collection: {collection_name}")
+            collection.drop()
+
+
+def validate_collection_count(
+    collection: Collection, expected_records: int, retries: int = 10, interval: int = 1
+) -> None:
+    count = collection.count_documents(filter={})
+    attempt = 0
+    while count != expected_records and attempt < retries:
+        attempt += 1
+        print(f"attempt {attempt} to get count of collection {count} to match {expected_records}")
+        time.sleep(interval)
+        count = collection.count_documents(filter={})
+    assert (
+        count == expected_records
+    ), f"expected count ({expected_records}) does not match how many records were found: {count}"
+
+
+def validate_collection_vector(
+    collection: Collection, embedding: list[float], text: str, retries: int = 30, interval: int = 1
+) -> None:
+    pipeline = [
+        {
+            "$vectorSearch": {
+                "index": "embeddings",
+                "path": "embeddings",
+                "queryVector": embedding,
+                "numCandidates": 150,
+                "limit": 10,
+            },
+        },
+        {"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
+    ]
+    attempts = 0
+    results = list(collection.aggregate(pipeline=pipeline))
+    while not results and attempts < retries:
+        attempts += 1
+        print(f"attempt {attempts}, waiting for valid results: {results}")
+        time.sleep(interval)
+        results = list(collection.aggregate(pipeline=pipeline))
+    if not results:
+        raise TimeoutError("Timed out waiting for valid results")
+    print(f"found results on attempt {attempts}")
+    top_result = results[0]
+    assert top_result["score"] == 1.0, "score detected should be 1: {}".format(top_result["score"])
+    assert top_result["text"] == text, "text detected should be {}, found: {}".format(
+        text, top_result["text"]
+    )
+    for r in results[1:]:
+        assert r["score"] < 1.0, "score detected should be less than 1: {}".format(r["score"])
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+async def test_mongodb_source(temp_dir: Path):
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(database=env_data.database, collection=SOURCE_COLLECTION)
+    download_config = MongoDBDownloaderConfig(download_dir=temp_dir)
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    downloader = MongoDBDownloader(
+        connection_config=connection_config, download_config=download_config
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=ValidationConfigs(
+            test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+        ),
+    )
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+def test_mongodb_indexer_precheck_fail_no_host():
+    indexer_config = MongoDBIndexerConfig(
+        database="non-existent-database", collection="non-existent-database"
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_indexer_precheck_fail_no_database():
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(
+        database="non-existent-database", collection=SOURCE_COLLECTION
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_indexer_precheck_fail_no_collection():
+    env_data = get_env_data()
+    indexer_config = MongoDBIndexerConfig(
+        database=env_data.database, collection="non-existent-collection"
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    indexer = MongoDBIndexer(connection_config=connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+async def test_mongodb_destination(
+    upload_file: Path,
+    destination_collection: Collection,
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mongodb_mock_id",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+
+    upload_config = MongoDBUploaderConfig(
+        database=env_data.database,
+        collection=destination_collection.name,
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+    uploader.run(path=upload_file, file_data=file_data)
+
+    with upload_file.open() as f:
+        staged_elements = json.load(f)
+    expected_records = len(staged_elements)
+    validate_collection_count(collection=destination_collection, expected_records=expected_records)
+    first_element = staged_elements[0]
+    validate_collection_vector(
+        collection=destination_collection,
+        embedding=first_element["embeddings"],
+        text=first_element["text"],
+    )
+
+    uploader.run(path=upload_file, file_data=file_data)
+    validate_collection_count(collection=destination_collection, expected_records=expected_records)
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_mongodb_uploader_precheck_fail_no_host():
+    upload_config = MongoDBUploaderConfig(
+        database="database",
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri="mongodb+srv://ingest-test.hgaig.mongodb"),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_uploader_precheck_fail_no_database():
+    env_data = get_env_data()
+    upload_config = MongoDBUploaderConfig(
+        database="database",
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
+
+
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("MONGODB_URI", "MONGODB_DATABASE")
+def test_mongodb_uploader_precheck_fail_no_collection():
+    env_data = get_env_data()
+    upload_config = MongoDBUploaderConfig(
+        database=env_data.database,
+        collection="collection",
+    )
+    connection_config = MongoDBConnectionConfig(
+        access_config=MongoDBAccessConfig(uri=env_data.uri.get_secret_value()),
+    )
+    uploader = MongoDBUploader(connection_config=connection_config, upload_config=upload_config)
+    with pytest.raises(DestinationConnectionError):
+        uploader.precheck()
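
Note: validate_collection_vector above polls an Atlas Search $vectorSearch aggregation until the inserted embeddings become queryable, then asserts the top hit is an exact match (score 1.0). The same query can be issued standalone with pymongo; the connection values below are placeholders:

    from pymongo import MongoClient

    client = MongoClient("mongodb+srv://user:pass@cluster.example.net")  # placeholder URI
    collection = client["ingest-test-db"]["utic-test-output-example"]    # placeholder names

    pipeline = [
        {
            "$vectorSearch": {
                "index": "embeddings",       # Atlas Search index created by the fixture
                "path": "embeddings",        # document field holding the stored vectors
                "queryVector": [0.1] * 384,  # must match the indexed dimensionality (384)
                "numCandidates": 150,
                "limit": 10,
            }
        },
        {"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
    ]
    results = list(collection.aggregate(pipeline))
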
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.1" # pragma: no cover
+__version__ = "0.3.2" # pragma: no cover
--- a/unstructured_ingest/v2/processes/connectors/google_drive.py
+++ b/unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -161,7 +161,7 @@ class GoogleDriveIndexer(Indexer):
             and isinstance(parent_root_path, str)
         ):
             fullpath = f"{parent_path}/{filename}"
-            rel_path = fullpath.replace(parent_root_path, "")
+            rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
             source_identifiers = SourceIdentifiers(
                 filename=filename, fullpath=fullpath, rel_path=rel_path
             )
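
Note: the google_drive change above swaps string substitution for proper path arithmetic. str.replace removes every occurrence of the root anywhere in the path and leaves a leading slash, while Path.relative_to strips only the leading root component. A contrived example:

    from pathlib import Path

    fullpath = "root/sub/root/file.txt"
    parent_root_path = "root"

    fullpath.replace(parent_root_path, "")                   # "/sub//file.txt" (old, wrong)
    Path(fullpath).relative_to(parent_root_path).as_posix()  # "sub/root/file.txt" (new)
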
--- a/unstructured_ingest/v2/processes/connectors/mongodb.py
+++ b/unstructured_ingest/v2/processes/connectors/mongodb.py
@@ -1,6 +1,7 @@
 import json
 import sys
-from dataclasses import dataclass, field
+from contextlib import contextmanager
+from dataclasses import dataclass, replace
 from datetime import datetime
 from pathlib import Path
 from time import time
@@ -12,6 +13,7 @@ from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -24,8 +26,6 @@ from unstructured_ingest.v2.interfaces import (
     SourceIdentifiers,
     Uploader,
     UploaderConfig,
-    UploadStager,
-    UploadStagerConfig,
     download_responses,
 )
 from unstructured_ingest.v2.logger import logger
@@ -36,6 +36,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 
 if TYPE_CHECKING:
     from pymongo import MongoClient
+    from pymongo.collection import Collection
 
 CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"
@@ -54,18 +55,37 @@ class MongoDBConnectionConfig(ConnectionConfig):
         description="hostname or IP address or Unix domain socket path of a single mongod or "
         "mongos instance to connect to, or a list of hostnames",
     )
-    database: Optional[str] = Field(default=None, description="database name to connect to")
-    collection: Optional[str] = Field(default=None, description="collection name to connect to")
     port: int = Field(default=27017)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @contextmanager
+    @requires_dependencies(["pymongo"], extras="mongodb")
+    def get_client(self) -> Generator["MongoClient", None, None]:
+        from pymongo import MongoClient
+        from pymongo.driver_info import DriverInfo
+        from pymongo.server_api import ServerApi
 
-class MongoDBUploadStagerConfig(UploadStagerConfig):
-    pass
+        access_config = self.access_config.get_secret_value()
+        if uri := access_config.uri:
+            client_kwargs = {
+                "host": uri,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+                "driver": DriverInfo(name="unstructured", version=unstructured_version),
+            }
+        else:
+            client_kwargs = {
+                "host": self.host,
+                "port": self.port,
+                "server_api": ServerApi(version=SERVER_API_VERSION),
+            }
+        with MongoClient(**client_kwargs) as client:
+            yield client
 
 
 class MongoDBIndexerConfig(IndexerConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
 
 
 class MongoDBDownloaderConfig(DownloaderConfig):
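
Note: with client construction centralized on MongoDBConnectionConfig, every component acquires a client the same way, and the context manager guarantees it is closed. A caller sketch (the URI is a placeholder):

    connection_config = MongoDBConnectionConfig(
        access_config=MongoDBAccessConfig(uri="mongodb+srv://user:pass@host.example"),  # placeholder
    )
    with connection_config.get_client() as client:
        client.admin.command("ping")  # client is closed automatically on exit
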
@@ -81,42 +101,38 @@ class MongoDBIndexer(Indexer):
     def precheck(self) -> None:
         """Validates the connection to the MongoDB server."""
         try:
-            client = self.create_client()
-            client.admin.command("ping")
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.index_config.database
+                if database_name not in database_names:
+                    raise SourceConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.index_config.collection
+                if collection_name not in collection_names:
+                    raise SourceConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
         except Exception as e:
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to validate connection: {e}")
 
-    @requires_dependencies(["pymongo"], extras="mongodb")
-    def create_client(self) -> "MongoClient":
-        from pymongo import MongoClient
-        from pymongo.driver_info import DriverInfo
-        from pymongo.server_api import ServerApi
-
-        access_config = self.connection_config.access_config.get_secret_value()
-
-        if access_config.uri:
-            return MongoClient(
-                access_config.uri,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-                driver=DriverInfo(name="unstructured", version=unstructured_version),
-            )
-        else:
-            return MongoClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-            )
-
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         """Generates FileData objects for each document in the MongoDB collection."""
-        client = self.create_client()
-        database = client[self.connection_config.database]
-        collection = database[self.connection_config.collection]
+        with self.connection_config.get_client() as client:
+            database = client[self.index_config.database]
+            collection = database[self.index_config.collection]
 
-        # Get list of document IDs
-        ids = collection.distinct("_id")
-        batch_size = self.index_config.batch_size if self.index_config else 100
+            # Get list of document IDs
+            ids = collection.distinct("_id")
+            batch_size = self.index_config.batch_size if self.index_config else 100
 
         for id_batch in batch_generator(ids, batch_size=batch_size):
             # Make sure the hash is always a positive number to create identifier
@@ -125,8 +141,8 @@ class MongoDBIndexer(Indexer):
             metadata = FileDataSourceMetadata(
                 date_processed=str(time()),
                 record_locator={
-                    "database": self.connection_config.database,
-                    "collection": self.connection_config.collection,
+                    "database": self.index_config.database,
+                    "collection": self.index_config.collection,
                 },
             )
 
@@ -177,8 +193,8 @@ class MongoDBDownloader(Downloader):
         from bson.objectid import ObjectId
 
         client = self.create_client()
-        database = client[self.connection_config.database]
-        collection = database[self.connection_config.collection]
+        database = client[file_data.metadata.record_locator["database"]]
+        collection = database[file_data.metadata.record_locator["collection"]]
 
         ids = file_data.additional_metadata.get("ids", [])
         if not ids:
@@ -222,14 +238,12 @@ class MongoDBDownloader(Downloader):
             concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
 
             # Create a FileData object for each document with source_identifiers
-            individual_file_data = FileData(
-                identifier=str(doc_id),
-                connector_type=self.connector_type,
-                source_identifiers=SourceIdentifiers(
-                    filename=str(doc_id),
-                    fullpath=str(doc_id),
-                    rel_path=str(doc_id),
-                ),
+            individual_file_data = replace(file_data)
+            individual_file_data.identifier = str(doc_id)
+            individual_file_data.source_identifiers = SourceIdentifiers(
+                filename=str(doc_id),
+                fullpath=str(doc_id),
+                rel_path=str(doc_id),
             )
 
             # Determine the download path
@@ -247,15 +261,8 @@ class MongoDBDownloader(Downloader):
             individual_file_data.local_download_path = str(download_path)
 
             # Update metadata
-            individual_file_data.metadata = FileDataSourceMetadata(
-                date_created=date_created,  # Include date_created here
-                date_processed=str(time()),
-                record_locator={
-                    "database": self.connection_config.database,
-                    "collection": self.connection_config.collection,
-                    "document_id": str(doc_id),
-                },
-            )
+            individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
+            individual_file_data.metadata.date_created = date_created
 
             download_response = self.generate_download_response(
                 file_data=individual_file_data, download_path=download_path
@@ -265,31 +272,14 @@ class MongoDBDownloader(Downloader):
         return download_responses
 
 
-@dataclass
-class MongoDBUploadStager(UploadStager):
-    upload_stager_config: MongoDBUploadStagerConfig = field(
-        default_factory=lambda: MongoDBUploadStagerConfig()
-    )
-
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(elements_contents, output_file)
-        return output_path
-
-
 class MongoDBUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -300,55 +290,76 @@ class MongoDBUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            client = self.create_client()
-            client.admin.command("ping")
+            with self.connection_config.get_client() as client:
+                client.admin.command("ping")
+                database_names = client.list_database_names()
+                database_name = self.upload_config.database
+                if database_name not in database_names:
+                    raise DestinationConnectionError(
+                        "database {} does not exist: {}".format(
+                            database_name, ", ".join(database_names)
+                        )
+                    )
+                database = client[database_name]
+                collection_names = database.list_collection_names()
+                collection_name = self.upload_config.collection
+                if collection_name not in collection_names:
+                    raise DestinationConnectionError(
+                        "collection {} does not exist: {}".format(
+                            collection_name, ", ".join(collection_names)
+                        )
+                    )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @requires_dependencies(["pymongo"], extras="mongodb")
-    def create_client(self) -> "MongoClient":
-        from pymongo import MongoClient
-        from pymongo.driver_info import DriverInfo
-        from pymongo.server_api import ServerApi
-
-        access_config = self.connection_config.access_config.get_secret_value()
-
-        if access_config.uri:
-            return MongoClient(
-                access_config.uri,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-                driver=DriverInfo(name="unstructured", version=unstructured_version),
-            )
-        else:
-            return MongoClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-            )
+    def can_delete(self, collection: "Collection") -> bool:
+        indexed_keys = []
+        for index in collection.list_indexes():
+            key_bson = index["key"]
+            indexed_keys.extend(key_bson.keys())
+        return self.upload_config.record_id_key in indexed_keys
+
+    def delete_by_record_id(self, collection: "Collection", file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from collection: {collection.name}"
+        )
+        query = {self.upload_config.record_id_key: file_data.identifier}
+        delete_results = collection.delete_many(filter=query)
+        logger.info(
+            f"deleted {delete_results.deleted_count} records from collection {collection.name}"
+        )
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
-            f"db, {self.connection_config.database}, "
-            f"collection {self.connection_config.collection} "
+            f"db, {self.upload_config.database}, "
+            f"collection {self.upload_config.collection} "
             f"at {self.connection_config.host}",
         )
-        client = self.create_client()
-        db = client[self.connection_config.database]
-        collection = db[self.connection_config.collection]
-        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
-            collection.insert_many(chunk)
+        # This would typically live in the stager but since no other manipulation
+        # is done, setting the record id field in the uploader
+        for element in elements_dict:
+            element[self.upload_config.record_id_key] = file_data.identifier
+        with self.connection_config.get_client() as client:
+            db = client[self.upload_config.database]
+            collection = db[self.upload_config.collection]
+            if self.can_delete(collection=collection):
+                self.delete_by_record_id(file_data=file_data, collection=collection)
+            else:
+                logger.warning("criteria for deleting previous content not met, skipping")
+            for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+                collection.insert_many(chunk)
 
 
 mongodb_destination_entry = DestinationRegistryEntry(
     connection_config=MongoDBConnectionConfig,
     uploader=MongoDBUploader,
     uploader_config=MongoDBUploaderConfig,
-    upload_stager=MongoDBUploadStager,
-    upload_stager_config=MongoDBUploadStagerConfig,
 )
 
 mongodb_source_entry = SourceRegistryEntry(
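
Note: together, record_id_key (defaulting to RECORD_ID_LABEL) and delete_by_record_id make uploads idempotent per source record: every element is stamped with its record's identifier, and rows from a previous run are deleted before insert, provided the collection has an index on that key (which can_delete verifies). The write path reduces to roughly this sketch (the names are stand-ins):

    def write_record(collection, elements, record_id, record_id_key="record_id", batch_size=100):
        for element in elements:
            element[record_id_key] = record_id  # stamp each element with its source record
        collection.delete_many({record_id_key: record_id})  # drop rows from a previous run
        for i in range(0, len(elements), batch_size):
            collection.insert_many(elements[i : i + batch_size])
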
--- a/unstructured_ingest/v2/processes/connectors/weaviate/__init__.py
+++ b/unstructured_ingest/v2/processes/connectors/weaviate/__init__.py
@@ -10,6 +10,8 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
 from .embedded import weaviate_embedded_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
 from .local import weaviate_local_destination_entry
+from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
+from .weaviate import weaviate_destination_entry
 
 add_destination_entry(
     destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -20,3 +22,4 @@ add_destination_entry(
 add_destination_entry(
     destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
 )
+add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
--- a/unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py
+++ b/unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py
@@ -22,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
     from weaviate.classes.init import Timeout
@@ -287,3 +288,12 @@ class WeaviateUploader(Uploader, ABC):
             vector=vector,
         )
         self.check_for_errors(client=weaviate_client)
+
+
+weaviate_destination_entry = DestinationRegistryEntry(
+    connection_config=WeaviateConnectionConfig,
+    uploader=WeaviateUploader,
+    uploader_config=WeaviateUploaderConfig,
+    upload_stager=WeaviateUploadStager,
+    upload_stager_config=WeaviateUploadStagerConfig,
+)
--- a/unstructured_ingest-0.3.1.dist-info/METADATA
+++ b/unstructured_ingest-0.3.2.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.1
+Version: 0.3.2
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: python-dateutil
-Requires-Dist: tqdm
-Requires-Dist: click
-Requires-Dist: dataclasses-json
 Requires-Dist: pydantic>=2.7
-Requires-Dist: pandas
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: click
+Requires-Dist: tqdm
+Requires-Dist: pandas
+Requires-Dist: dataclasses-json
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -41,8 +41,8 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -98,8 +98,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -115,19 +115,19 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -156,8 +156,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
--- a/unstructured_ingest-0.3.1.dist-info/RECORD
+++ b/unstructured_ingest-0.3.2.dist-info/RECORD
@@ -6,12 +6,13 @@ test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-Lr
 test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
 test/integration/connectors/test_astradb.py,sha256=QPFrODXmOHagpuKaiooxXb3OEW93w2g4fmq8BkaBCnY,5303
-test/integration/connectors/test_azure_cog_search.py,sha256=dae4GifRiKue5YpsxworDiaMQoMsxcPDBithb6OFkx4,8876
+test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworDiaMQoMsxcPDBithb6OFkx4,8876
 test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
 test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
 test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
-test/integration/connectors/test_lancedb.py,sha256=1EqdXOaA3gJqXDe1W-dHUzfOfeL1A4RB0oYwKvlfltg,7590
+test/integration/connectors/test_lancedb.py,sha256=O3YF6MVBkCsCgklXCJe8Kpy8aKGfafASVH4PspmpcYs,7628
 test/integration/connectors/test_milvus.py,sha256=CVmYw9iEeKT_0OtShxye2E6i1LbWzzDA8JtwJRkYQlA,4763
+test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
 test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
 test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
 test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
@@ -80,7 +81,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=0bjUtHIzwwONNua74ouSySVzVv9qumqBMBxOWLE7Tbo,42
+unstructured_ingest/__version__.py,sha256=Js7MXQhyIj1akVjPNsLkmZxqoOHDGOr2opEPgFOSTZQ,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -400,11 +401,11 @@ unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHX
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=yhMDbpkZXs-Kis7tFlgjvNemU-MdWMdpCZDrpZNFaU4,12180
 unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
 unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
-unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=a1BAvhX3nsgghjuR5CJ1lOwMtJ5ZJwimg6VtDYvluxA,13104
+unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
 unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
 unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=Bzv2fa852BcM4_Pr-I_DPvLmjPoXv0Z7BeEA8qSKCDc,9725
-unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
+unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
@@ -451,14 +452,14 @@ unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
 unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
-unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
+unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=2g1Fm2J0ppfy2jCw4b5YtrsWrSD3VcrAaqiE7FlpIAg,6236
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
-unstructured_ingest-0.3.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.3.1.dist-info/METADATA,sha256=gEXBJbX1y03XJgGGqXpNlkOw1PJ4IhEHmohj2CXHq9g,7326
-unstructured_ingest-0.3.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-unstructured_ingest-0.3.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.3.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.3.1.dist-info/RECORD,,
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
+unstructured_ingest-0.3.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.3.2.dist-info/METADATA,sha256=rqTWqewB8eIrgrHJ-8AsNtehy35eSHKseCsveXTwN3Y,7326
+unstructured_ingest-0.3.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.3.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.3.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.3.2.dist-info/RECORD,,