unstructured-ingest 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

This release of unstructured-ingest has been flagged as potentially problematic.

Files changed (24)
  1. test/integration/connectors/test_lancedb.py +9 -8
  2. test/integration/connectors/test_milvus.py +34 -6
  3. test/integration/connectors/test_mongodb.py +332 -0
  4. test/integration/connectors/weaviate/test_cloud.py +34 -0
  5. test/unit/test_utils.py +21 -1
  6. unstructured_ingest/__version__.py +1 -1
  7. unstructured_ingest/utils/string_and_date_utils.py +10 -0
  8. unstructured_ingest/v2/processes/connectors/astradb.py +16 -0
  9. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  10. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -4
  11. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +7 -7
  12. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  13. unstructured_ingest/v2/processes/connectors/milvus.py +9 -3
  14. unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
  15. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +3 -0
  16. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +4 -3
  17. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +10 -0
  18. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/METADATA +14 -12
  19. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/RECORD +24 -21
  20. /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
  21. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/LICENSE.md +0 -0
  22. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/WHEEL +0 -0
  23. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/entry_points.txt +0 -0
  24. {unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/lancedb/cloud.py
@@ -0,0 +1,42 @@
+ from dataclasses import dataclass
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.interfaces.connector import AccessConfig
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
+     LanceDBRemoteConnectionConfig,
+     LanceDBUploader,
+     LanceDBUploaderConfig,
+     LanceDBUploadStager,
+     LanceDBUploadStagerConfig,
+ )
+
+ CONNECTOR_TYPE = "lancedb_cloud"
+
+
+ class LanceDBCloudAccessConfig(AccessConfig):
+     api_key: str = Field(description="Api key associated with LanceDb cloud")
+
+
+ class LanceDBCloudConnectionConfig(LanceDBRemoteConnectionConfig):
+     access_config: Secret[LanceDBCloudAccessConfig]
+
+     def get_storage_options(self) -> dict:
+         return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
+
+
+ @dataclass
+ class LanceDBCloudUploader(LanceDBUploader):
+     upload_config: LanceDBUploaderConfig
+     connection_config: LanceDBCloudConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ lancedb_cloud_destination_entry = DestinationRegistryEntry(
+     connection_config=LanceDBCloudConnectionConfig,
+     uploader=LanceDBCloudUploader,
+     uploader_config=LanceDBUploaderConfig,
+     upload_stager_config=LanceDBUploadStagerConfig,
+     upload_stager=LanceDBUploadStager,
+ )
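A minimal sketch of how this new cloud connector could be configured; the API key is a placeholder, and the uri and timeout fields are assumed to be inherited from LanceDBRemoteConnectionConfig rather than shown in this diff:

from unstructured_ingest.v2.processes.connectors.lancedb.cloud import (
    LanceDBCloudAccessConfig,
    LanceDBCloudConnectionConfig,
)

# Hypothetical credentials and endpoint, for illustration only.
connection_config = LanceDBCloudConnectionConfig(
    access_config=LanceDBCloudAccessConfig(api_key="sk-example"),
    uri="db://example-project",  # assumed field on LanceDBRemoteConnectionConfig
)

# The uploader calls get_storage_options() to merge the secret API key
# with the shared timeout setting when opening the remote connection.
print(connection_config.get_storage_options())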
unstructured_ingest/v2/processes/connectors/milvus.py
@@ -8,7 +8,7 @@ import pandas as pd
  from dateutil import parser
  from pydantic import Field, Secret

- from unstructured_ingest.error import WriteError
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
  from unstructured_ingest.utils.data_prep import flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
@@ -66,7 +66,6 @@ class MilvusConnectionConfig(ConnectionConfig):


  class MilvusUploadStagerConfig(UploadStagerConfig):
-
      fields_to_include: Optional[list[str]] = None
      """If set - list of fields to include in the output.
      Unspecified fields are removed from the elements.
@@ -174,6 +173,14 @@ class MilvusUploader(Uploader):
      upload_config: MilvusUploaderConfig
      connector_type: str = CONNECTOR_TYPE

+     @DestinationConnectionError.wrap
+     def precheck(self):
+         with self.get_client() as client:
+             if not client.has_collection(self.upload_config.collection_name):
+                 raise DestinationConnectionError(
+                     f"Collection '{self.upload_config.collection_name}' does not exist"
+                 )
+
      @contextmanager
      def get_client(self) -> Generator["MilvusClient", None, None]:
          client = self.connection_config.get_client()
@@ -218,7 +225,6 @@
              f"db in collection {self.upload_config.collection_name}"
          )
          with self.get_client() as client:
-
              try:
                  res = client.insert(collection_name=self.upload_config.collection_name, data=data)
              except MilvusException as milvus_exception:
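The new precheck fails fast when the target collection is missing, wrapping any client error in DestinationConnectionError. A standalone sketch of the same check using pymilvus directly; the URI and collection name are placeholders:

from pymilvus import MilvusClient

# Placeholder connection details for illustration.
client = MilvusClient(uri="http://localhost:19530")

# Mirrors the check MilvusUploader.precheck now performs before any upload.
if not client.has_collection("elements"):
    raise RuntimeError("Collection 'elements' does not exist")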
unstructured_ingest/v2/processes/connectors/mongodb.py
@@ -1,6 +1,7 @@
  import json
  import sys
- from dataclasses import dataclass, field
+ from contextlib import contextmanager
+ from dataclasses import dataclass, replace
  from datetime import datetime
  from pathlib import Path
  from time import time
@@ -12,6 +13,7 @@ from unstructured_ingest.__version__ import __version__ as unstructured_version
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
  from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
@@ -24,8 +26,6 @@ from unstructured_ingest.v2.interfaces import (
      SourceIdentifiers,
      Uploader,
      UploaderConfig,
-     UploadStager,
-     UploadStagerConfig,
      download_responses,
  )
  from unstructured_ingest.v2.logger import logger
@@ -36,6 +36,7 @@ from unstructured_ingest.v2.processes.connector_registry import (

  if TYPE_CHECKING:
      from pymongo import MongoClient
+     from pymongo.collection import Collection

  CONNECTOR_TYPE = "mongodb"
  SERVER_API_VERSION = "1"
@@ -54,18 +55,37 @@ class MongoDBConnectionConfig(ConnectionConfig):
          description="hostname or IP address or Unix domain socket path of a single mongod or "
          "mongos instance to connect to, or a list of hostnames",
      )
-     database: Optional[str] = Field(default=None, description="database name to connect to")
-     collection: Optional[str] = Field(default=None, description="collection name to connect to")
      port: int = Field(default=27017)
      connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+     @contextmanager
+     @requires_dependencies(["pymongo"], extras="mongodb")
+     def get_client(self) -> Generator["MongoClient", None, None]:
+         from pymongo import MongoClient
+         from pymongo.driver_info import DriverInfo
+         from pymongo.server_api import ServerApi

- class MongoDBUploadStagerConfig(UploadStagerConfig):
-     pass
+         access_config = self.access_config.get_secret_value()
+         if uri := access_config.uri:
+             client_kwargs = {
+                 "host": uri,
+                 "server_api": ServerApi(version=SERVER_API_VERSION),
+                 "driver": DriverInfo(name="unstructured", version=unstructured_version),
+             }
+         else:
+             client_kwargs = {
+                 "host": self.host,
+                 "port": self.port,
+                 "server_api": ServerApi(version=SERVER_API_VERSION),
+             }
+         with MongoClient(**client_kwargs) as client:
+             yield client


  class MongoDBIndexerConfig(IndexerConfig):
      batch_size: int = Field(default=100, description="Number of records per batch")
+     database: Optional[str] = Field(default=None, description="database name to connect to")
+     collection: Optional[str] = Field(default=None, description="collection name to connect to")


  class MongoDBDownloaderConfig(DownloaderConfig):
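With connection logic consolidated into a single get_client context manager, callers no longer manage client lifetimes themselves. A self-contained sketch of the same pattern, assuming pymongo is installed and a URI-style connection string:

from contextlib import contextmanager
from typing import Generator

from pymongo import MongoClient
from pymongo.server_api import ServerApi

@contextmanager
def get_client(uri: str) -> Generator[MongoClient, None, None]:
    # MongoClient supports the context-manager protocol, so the
    # connection is closed automatically when the block exits.
    with MongoClient(host=uri, server_api=ServerApi(version="1")) as client:
        yield client

# Usage: the client is valid only inside the with-block.
with get_client("mongodb://localhost:27017") as client:
    client.admin.command("ping")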
@@ -81,42 +101,38 @@ class MongoDBIndexer(Indexer):
      def precheck(self) -> None:
          """Validates the connection to the MongoDB server."""
          try:
-             client = self.create_client()
-             client.admin.command("ping")
+             with self.connection_config.get_client() as client:
+                 client.admin.command("ping")
+                 database_names = client.list_database_names()
+                 database_name = self.index_config.database
+                 if database_name not in database_names:
+                     raise DestinationConnectionError(
+                         "database {} does not exist: {}".format(
+                             database_name, ", ".join(database_names)
+                         )
+                     )
+                 database = client[database_name]
+                 collection_names = database.list_collection_names()
+                 collection_name = self.index_config.collection
+                 if collection_name not in collection_names:
+                     raise SourceConnectionError(
+                         "collection {} does not exist: {}".format(
+                             collection_name, ", ".join(collection_names)
+                         )
+                     )
          except Exception as e:
              logger.error(f"Failed to validate connection: {e}", exc_info=True)
              raise SourceConnectionError(f"Failed to validate connection: {e}")

-     @requires_dependencies(["pymongo"], extras="mongodb")
-     def create_client(self) -> "MongoClient":
-         from pymongo import MongoClient
-         from pymongo.driver_info import DriverInfo
-         from pymongo.server_api import ServerApi
-
-         access_config = self.connection_config.access_config.get_secret_value()
-
-         if access_config.uri:
-             return MongoClient(
-                 access_config.uri,
-                 server_api=ServerApi(version=SERVER_API_VERSION),
-                 driver=DriverInfo(name="unstructured", version=unstructured_version),
-             )
-         else:
-             return MongoClient(
-                 host=self.connection_config.host,
-                 port=self.connection_config.port,
-                 server_api=ServerApi(version=SERVER_API_VERSION),
-             )
-
      def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
          """Generates FileData objects for each document in the MongoDB collection."""
-         client = self.create_client()
-         database = client[self.connection_config.database]
-         collection = database[self.connection_config.collection]
+         with self.connection_config.get_client() as client:
+             database = client[self.index_config.database]
+             collection = database[self.index_config.collection]

-         # Get list of document IDs
-         ids = collection.distinct("_id")
-         batch_size = self.index_config.batch_size if self.index_config else 100
+             # Get list of document IDs
+             ids = collection.distinct("_id")
+             batch_size = self.index_config.batch_size if self.index_config else 100

          for id_batch in batch_generator(ids, batch_size=batch_size):
              # Make sure the hash is always a positive number to create identifier
@@ -125,8 +141,8 @@ class MongoDBIndexer(Indexer):
              metadata = FileDataSourceMetadata(
                  date_processed=str(time()),
                  record_locator={
-                     "database": self.connection_config.database,
-                     "collection": self.connection_config.collection,
+                     "database": self.index_config.database,
+                     "collection": self.index_config.collection,
                  },
              )
@@ -177,8 +193,8 @@ class MongoDBDownloader(Downloader):
          from bson.objectid import ObjectId

          client = self.create_client()
-         database = client[self.connection_config.database]
-         collection = database[self.connection_config.collection]
+         database = client[file_data.metadata.record_locator["database"]]
+         collection = database[file_data.metadata.record_locator["collection"]]

          ids = file_data.additional_metadata.get("ids", [])
          if not ids:
@@ -222,14 +238,12 @@ class MongoDBDownloader(Downloader):
          concatenated_values = "\n".join(str(value) for value in flattened_dict.values())

          # Create a FileData object for each document with source_identifiers
-         individual_file_data = FileData(
-             identifier=str(doc_id),
-             connector_type=self.connector_type,
-             source_identifiers=SourceIdentifiers(
-                 filename=str(doc_id),
-                 fullpath=str(doc_id),
-                 rel_path=str(doc_id),
-             ),
+         individual_file_data = replace(file_data)
+         individual_file_data.identifier = str(doc_id)
+         individual_file_data.source_identifiers = SourceIdentifiers(
+             filename=str(doc_id),
+             fullpath=str(doc_id),
+             rel_path=str(doc_id),
          )

          # Determine the download path
@@ -247,15 +261,8 @@ class MongoDBDownloader(Downloader):
          individual_file_data.local_download_path = str(download_path)

          # Update metadata
-         individual_file_data.metadata = FileDataSourceMetadata(
-             date_created=date_created,  # Include date_created here
-             date_processed=str(time()),
-             record_locator={
-                 "database": self.connection_config.database,
-                 "collection": self.connection_config.collection,
-                 "document_id": str(doc_id),
-             },
-         )
+         individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
+         individual_file_data.metadata.date_created = date_created

          download_response = self.generate_download_response(
              file_data=individual_file_data, download_path=download_path
@@ -265,31 +272,14 @@ class MongoDBDownloader(Downloader):
          return download_responses


- @dataclass
- class MongoDBUploadStager(UploadStager):
-     upload_stager_config: MongoDBUploadStagerConfig = field(
-         default_factory=lambda: MongoDBUploadStagerConfig()
-     )
-
-     def run(
-         self,
-         elements_filepath: Path,
-         file_data: FileData,
-         output_dir: Path,
-         output_filename: str,
-         **kwargs: Any,
-     ) -> Path:
-         with open(elements_filepath) as elements_file:
-             elements_contents = json.load(elements_file)
-
-         output_path = Path(output_dir) / Path(f"{output_filename}.json")
-         with open(output_path, "w") as output_file:
-             json.dump(elements_contents, output_file)
-         return output_path
-
-
  class MongoDBUploaderConfig(UploaderConfig):
      batch_size: int = Field(default=100, description="Number of records per batch")
+     database: Optional[str] = Field(default=None, description="database name to connect to")
+     collection: Optional[str] = Field(default=None, description="collection name to connect to")
+     record_id_key: str = Field(
+         default=RECORD_ID_LABEL,
+         description="searchable key to find entries for the same record on previous runs",
+     )


  @dataclass
@@ -300,55 +290,76 @@ class MongoDBUploader(Uploader):

      def precheck(self) -> None:
          try:
-             client = self.create_client()
-             client.admin.command("ping")
+             with self.connection_config.get_client() as client:
+                 client.admin.command("ping")
+                 database_names = client.list_database_names()
+                 database_name = self.upload_config.database
+                 if database_name not in database_names:
+                     raise DestinationConnectionError(
+                         "database {} does not exist: {}".format(
+                             database_name, ", ".join(database_names)
+                         )
+                     )
+                 database = client[database_name]
+                 collection_names = database.list_collection_names()
+                 collection_name = self.upload_config.collection
+                 if collection_name not in collection_names:
+                     raise SourceConnectionError(
+                         "collection {} does not exist: {}".format(
+                             collection_name, ", ".join(collection_names)
+                         )
+                     )
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

-     @requires_dependencies(["pymongo"], extras="mongodb")
-     def create_client(self) -> "MongoClient":
-         from pymongo import MongoClient
-         from pymongo.driver_info import DriverInfo
-         from pymongo.server_api import ServerApi
-
-         access_config = self.connection_config.access_config.get_secret_value()
-
-         if access_config.uri:
-             return MongoClient(
-                 access_config.uri,
-                 server_api=ServerApi(version=SERVER_API_VERSION),
-                 driver=DriverInfo(name="unstructured", version=unstructured_version),
-             )
-         else:
-             return MongoClient(
-                 host=self.connection_config.host,
-                 port=self.connection_config.port,
-                 server_api=ServerApi(version=SERVER_API_VERSION),
-             )
+     def can_delete(self, collection: "Collection") -> bool:
+         indexed_keys = []
+         for index in collection.list_indexes():
+             key_bson = index["key"]
+             indexed_keys.extend(key_bson.keys())
+         return self.upload_config.record_id_key in indexed_keys
+
+     def delete_by_record_id(self, collection: "Collection", file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata "
+             f"{self.upload_config.record_id_key}={file_data.identifier} "
+             f"from collection: {collection.name}"
+         )
+         query = {self.upload_config.record_id_key: file_data.identifier}
+         delete_results = collection.delete_many(filter=query)
+         logger.info(
+             f"deleted {delete_results.deleted_count} records from collection {collection.name}"
+         )

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          with path.open("r") as file:
              elements_dict = json.load(file)
          logger.info(
              f"writing {len(elements_dict)} objects to destination "
-             f"db, {self.connection_config.database}, "
-             f"collection {self.connection_config.collection} "
+             f"db, {self.upload_config.database}, "
+             f"collection {self.upload_config.collection} "
              f"at {self.connection_config.host}",
          )
-         client = self.create_client()
-         db = client[self.connection_config.database]
-         collection = db[self.connection_config.collection]
-         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
-             collection.insert_many(chunk)
+         # This would typically live in the stager but since no other manipulation
+         # is done, setting the record id field in the uploader
+         for element in elements_dict:
+             element[self.upload_config.record_id_key] = file_data.identifier
+         with self.connection_config.get_client() as client:
+             db = client[self.upload_config.database]
+             collection = db[self.upload_config.collection]
+             if self.can_delete(collection=collection):
+                 self.delete_by_record_id(file_data=file_data, collection=collection)
+             else:
+                 logger.warning("criteria for deleting previous content not met, skipping")
+             for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+                 collection.insert_many(chunk)


  mongodb_destination_entry = DestinationRegistryEntry(
      connection_config=MongoDBConnectionConfig,
      uploader=MongoDBUploader,
      uploader_config=MongoDBUploaderConfig,
-     upload_stager=MongoDBUploadStager,
-     upload_stager_config=MongoDBUploadStagerConfig,
  )

  mongodb_source_entry = SourceRegistryEntry(
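Taken together, the uploader changes implement a simple upsert: each element is stamped with the record id, prior entries for that record are deleted when the key is indexed, and the new batch is inserted. A condensed sketch of that flow with placeholder connection details and names:

from pymongo import MongoClient

# Placeholder connection and identifiers for illustration.
client = MongoClient("mongodb://localhost:27017")
collection = client["example_db"]["example_collection"]
record_id_key = "record_id"
record_id = "doc-123"
elements = [{"text": "hello"}, {"text": "world"}]

# Stamp every element so later runs can find and replace this record.
for element in elements:
    element[record_id_key] = record_id

# Delete-then-insert is only attempted when the key is indexed,
# which is what can_delete() verifies via collection.list_indexes().
indexed_keys = [k for index in collection.list_indexes() for k in index["key"].keys()]
if record_id_key in indexed_keys:
    collection.delete_many({record_id_key: record_id})
collection.insert_many(elements)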
unstructured_ingest/v2/processes/connectors/weaviate/__init__.py
@@ -10,6 +10,8 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
  from .embedded import weaviate_embedded_destination_entry
  from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
  from .local import weaviate_local_destination_entry
+ from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
+ from .weaviate import weaviate_destination_entry

  add_destination_entry(
      destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -20,3 +22,4 @@ add_destination_entry(
  add_destination_entry(
      destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
  )
+ add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
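A sketch of what this registration enables: once the package is imported, the generic weaviate connector is registered the same way the local and embedded variants are. The add_destination_entry import path is assumed, since this hunk shows only the call:

from unstructured_ingest.v2.processes.connector_registry import (
    add_destination_entry,  # assumed import path; the hunk shows only the call
)
from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
    CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE,
    weaviate_destination_entry,
)

# Same call the package's weaviate __init__ now makes at import time.
add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)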
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py
@@ -55,10 +55,11 @@ class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
              "client_secret": access_config.client_secret is not None,
              "client_password": access_config.password is not None and self.username is not None,
          }
-         if len(auths) == 0:
+         existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
+
+         if len(existing_auths) == 0:
              raise ValueError("No auth values provided and anonymous is False")
-         if len(auths) > 1:
-             existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
+         if len(existing_auths) > 1:
              raise ValueError(
                  "Multiple auth values provided, only one approach can be used: {}".format(
                      ", ".join(existing_auths)
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py
@@ -22,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
      UploadStagerConfig,
  )
  from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

  if TYPE_CHECKING:
      from weaviate.classes.init import Timeout
@@ -287,3 +288,12 @@ class WeaviateUploader(Uploader, ABC):
                  vector=vector,
              )
          self.check_for_errors(client=weaviate_client)
+
+
+ weaviate_destination_entry = DestinationRegistryEntry(
+     connection_config=WeaviateConnectionConfig,
+     uploader=WeaviateUploader,
+     uploader_config=WeaviateUploaderConfig,
+     upload_stager=WeaviateUploadStager,
+     upload_stager_config=WeaviateUploadStagerConfig,
+ )
{unstructured_ingest-0.3.1.dist-info → unstructured_ingest-0.3.3.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: unstructured-ingest
- Version: 0.3.1
+ Version: 0.3.3
  Summary: A library that prepares raw documents for downstream ML tasks.
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
  Author: Unstructured Technologies
@@ -22,27 +22,27 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.9.0,<3.13
  Description-Content-Type: text/markdown
  License-File: LICENSE.md
- Requires-Dist: python-dateutil
- Requires-Dist: tqdm
- Requires-Dist: click
  Requires-Dist: dataclasses-json
  Requires-Dist: pydantic>=2.7
  Requires-Dist: pandas
+ Requires-Dist: tqdm
+ Requires-Dist: python-dateutil
+ Requires-Dist: click
  Requires-Dist: opentelemetry-sdk
  Provides-Extra: airtable
  Requires-Dist: pyairtable; extra == "airtable"
  Provides-Extra: astradb
  Requires-Dist: astrapy; extra == "astradb"
  Provides-Extra: azure
- Requires-Dist: fsspec; extra == "azure"
  Requires-Dist: adlfs; extra == "azure"
+ Requires-Dist: fsspec; extra == "azure"
  Provides-Extra: azure-ai-search
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
  Provides-Extra: bedrock
  Requires-Dist: boto3; extra == "bedrock"
  Provides-Extra: biomed
- Requires-Dist: requests; extra == "biomed"
  Requires-Dist: bs4; extra == "biomed"
+ Requires-Dist: requests; extra == "biomed"
  Provides-Extra: box
  Requires-Dist: boxfs; extra == "box"
  Requires-Dist: fsspec; extra == "box"
@@ -91,21 +91,23 @@ Requires-Dist: bs4; extra == "gcs"
  Requires-Dist: gcsfs; extra == "gcs"
  Requires-Dist: fsspec; extra == "gcs"
  Provides-Extra: github
- Requires-Dist: requests; extra == "github"
  Requires-Dist: pygithub>1.58.0; extra == "github"
+ Requires-Dist: requests; extra == "github"
  Provides-Extra: gitlab
  Requires-Dist: python-gitlab; extra == "gitlab"
  Provides-Extra: google-drive
  Requires-Dist: google-api-python-client; extra == "google-drive"
  Provides-Extra: hubspot
- Requires-Dist: hubspot-api-client; extra == "hubspot"
  Requires-Dist: urllib3; extra == "hubspot"
+ Requires-Dist: hubspot-api-client; extra == "hubspot"
  Provides-Extra: jira
  Requires-Dist: atlassian-python-api; extra == "jira"
  Provides-Extra: kafka
  Requires-Dist: confluent-kafka; extra == "kafka"
  Provides-Extra: kdbai
  Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
+ Provides-Extra: lancedb
+ Requires-Dist: lancedb; extra == "lancedb"
  Provides-Extra: md
  Requires-Dist: unstructured[md]; extra == "md"
  Provides-Extra: milvus
@@ -115,16 +117,16 @@ Requires-Dist: pymongo; extra == "mongodb"
  Provides-Extra: msg
  Requires-Dist: unstructured[msg]; extra == "msg"
  Provides-Extra: notion
+ Requires-Dist: backoff; extra == "notion"
  Requires-Dist: htmlBuilder; extra == "notion"
  Requires-Dist: notion-client; extra == "notion"
- Requires-Dist: backoff; extra == "notion"
  Requires-Dist: httpx; extra == "notion"
  Provides-Extra: odt
  Requires-Dist: unstructured[odt]; extra == "odt"
  Provides-Extra: onedrive
  Requires-Dist: msal; extra == "onedrive"
- Requires-Dist: bs4; extra == "onedrive"
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+ Requires-Dist: bs4; extra == "onedrive"
  Provides-Extra: openai
  Requires-Dist: openai; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -156,8 +158,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
  Provides-Extra: rtf
  Requires-Dist: unstructured[rtf]; extra == "rtf"
  Provides-Extra: s3
- Requires-Dist: s3fs; extra == "s3"
  Requires-Dist: fsspec; extra == "s3"
+ Requires-Dist: s3fs; extra == "s3"
  Provides-Extra: salesforce
  Requires-Dist: simple-salesforce; extra == "salesforce"
  Provides-Extra: sftp
@@ -171,8 +173,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
  Provides-Extra: slack
  Requires-Dist: slack-sdk[optional]; extra == "slack"
  Provides-Extra: snowflake
- Requires-Dist: psycopg2-binary; extra == "snowflake"
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
+ Requires-Dist: psycopg2-binary; extra == "snowflake"
  Provides-Extra: togetherai
  Requires-Dist: together; extra == "togetherai"
  Provides-Extra: tsv