unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (52)
  1. test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
  2. test/integration/connectors/sql/test_postgres.py +9 -5
  3. test/integration/connectors/sql/test_singlestore.py +9 -5
  4. test/integration/connectors/sql/test_snowflake.py +6 -2
  5. test/integration/connectors/sql/test_sqlite.py +9 -5
  6. test/integration/connectors/test_astradb.py +40 -0
  7. test/integration/connectors/test_kafka.py +2 -2
  8. test/integration/connectors/test_mongodb.py +4 -1
  9. test/integration/connectors/utils/validation/source.py +31 -11
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  12. unstructured_ingest/v2/interfaces/file_data.py +69 -15
  13. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  15. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  16. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  17. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  18. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  19. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  20. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  21. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  22. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  23. unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
  24. unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
  25. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
  26. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
  27. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
  28. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
  29. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
  30. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  31. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  32. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  33. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  34. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  35. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  36. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  37. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  38. unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
  39. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  40. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  41. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  42. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  43. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  44. unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
  45. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  46. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
  47. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
  48. /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
  49. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
  50. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
  51. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
  52. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -1,16 +1,17 @@
  from __future__ import annotations

+ from contextlib import contextmanager
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
- from typing import Any, Generator, Optional, Union
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union

  from dateutil import parser
  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.utils.string_and_date_utils import json_to_dict
- from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+ from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
      FsspecUploaderConfig,
  )

+ if TYPE_CHECKING:
+     from gcsfs import GCSFileSystem
+
  CONNECTOR_TYPE = "gcs"


@@ -93,6 +97,12 @@ class GcsConnectionConfig(FsspecConnectionConfig):
      access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
      connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+     @contextmanager
+     def get_client(self, protocol: str) -> Generator["GCSFileSystem", None, None]:
+         with super().get_client(protocol=protocol) as client:
+             yield client
+

  @dataclass
  class GcsIndexer(FsspecIndexer):
@@ -100,14 +110,6 @@ class GcsIndexer(FsspecIndexer):
      index_config: GcsIndexerConfig
      connector_type: str = CONNECTOR_TYPE

-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-         return super().run(**kwargs)
-
-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     def precheck(self) -> None:
-         super().precheck()
-
      def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
          path = file_data["name"]
          date_created = None
@@ -147,14 +149,6 @@ class GcsDownloader(FsspecDownloader):
      connector_type: str = CONNECTOR_TYPE
      download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig)

-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         return super().run(file_data=file_data, **kwargs)
-
-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         return await super().run_async(file_data=file_data, **kwargs)
-

  class GcsUploaderConfig(FsspecUploaderConfig):
      pass
@@ -166,22 +160,6 @@ class GcsUploader(FsspecUploader):
      connection_config: GcsConnectionConfig
      upload_config: GcsUploaderConfig = field(default=None)

-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     def __post_init__(self):
-         super().__post_init__()
-
-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     def precheck(self) -> None:
-         super().precheck()
-
-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return super().run(path=path, file_data=file_data, **kwargs)
-
-     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return await super().run_async(path=path, file_data=file_data, **kwargs)
-

  gcs_source_entry = SourceRegistryEntry(
      indexer=GcsIndexer,

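Across the fsspec-based connectors in this release (GCS above, S3 and SFTP below), the per-method @requires_dependencies overrides on run(), run_async(), precheck(), and __post_init__() are replaced by a single get_client() context manager on the connection config, so the optional dependency is checked once where the filesystem client is created. A minimal usage sketch, assuming gcsfs is installed and the default access config is enough for anonymous access; the bucket name is a placeholder, not a value from the diff:

from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsConnectionConfig

config = GcsConnectionConfig()  # default GcsAccessConfig, i.e. ambient/anonymous credentials

# The gcsfs/fsspec dependency check now happens here, once, instead of being
# repeated on every overridden indexer/downloader/uploader method.
with config.get_client(protocol="gcs") as client:
    print(client.ls("some-public-bucket"))  # client is a gcsfs.GCSFileSystem
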
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -1,15 +1,13 @@
  import contextlib
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
- from pathlib import Path
  from time import time
- from typing import Any, Generator, Optional
+ from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
-     DownloadResponse,
-     FileData,
      FileDataSourceMetadata,
  )
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -29,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (

  CONNECTOR_TYPE = "s3"

+ if TYPE_CHECKING:
+     from s3fs import S3FileSystem
+

  class S3IndexerConfig(FsspecIndexerConfig):
      pass
@@ -72,6 +73,12 @@ class S3ConnectionConfig(FsspecConnectionConfig):
          )
          return access_configs

+     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+     @contextmanager
+     def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
+         with super().get_client(protocol=protocol) as client:
+             yield client
+

  @dataclass
  class S3Indexer(FsspecIndexer):
@@ -97,7 +104,8 @@ class S3Indexer(FsspecIndexer):
          version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
          metadata: dict[str, str] = {}
          with contextlib.suppress(AttributeError):
-             metadata = self.fs.metadata(path=path)
+             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                 metadata = client.metadata(path=path)
          record_locator = {
              "protocol": self.index_config.protocol,
              "remote_file_path": self.index_config.remote_url,
@@ -114,14 +122,6 @@ class S3Indexer(FsspecIndexer):
              filesize_bytes=file_size,
          )

-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-         return super().run(**kwargs)
-
-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     def precheck(self) -> None:
-         super().precheck()
-

  class S3DownloaderConfig(FsspecDownloaderConfig):
      pass
@@ -134,14 +134,6 @@ class S3Downloader(FsspecDownloader):
      connector_type: str = CONNECTOR_TYPE
      download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig)

-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         return super().run(file_data=file_data, **kwargs)
-
-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         return await super().run_async(file_data=file_data, **kwargs)
-

  class S3UploaderConfig(FsspecUploaderConfig):
      pass
@@ -153,22 +145,6 @@ class S3Uploader(FsspecUploader):
      connection_config: S3ConnectionConfig
      upload_config: S3UploaderConfig = field(default=None)

-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     def precheck(self) -> None:
-         super().precheck()
-
-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     def __post_init__(self):
-         super().__post_init__()
-
-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return super().run(path=path, file_data=file_data, **kwargs)
-
-     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return await super().run_async(path=path, file_data=file_data, **kwargs)
-

  s3_source_entry = SourceRegistryEntry(
      indexer=S3Indexer,

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -1,16 +1,17 @@
  from __future__ import annotations

  import os
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
- from typing import Any, Generator, Optional
+ from typing import TYPE_CHECKING, Any, Generator, Optional
  from urllib.parse import urlparse

  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+ from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
      FsspecUploaderConfig,
  )

+ if TYPE_CHECKING:
+     from fsspec.implementations.sftp import SFTPFileSystem
+
  CONNECTOR_TYPE = "sftp"


@@ -67,6 +71,19 @@ class SftpConnectionConfig(FsspecConnectionConfig):
          }
          return access_config

+     @contextmanager
+     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+     def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
+         # The paramiko.SSHClient() client that's opened by the SFTPFileSystem
+         # never gets closed so explicitly adding that as part of this context manager
+         from fsspec import get_filesystem_class
+
+         client: SFTPFileSystem = get_filesystem_class(protocol)(
+             **self.get_access_config(),
+         )
+         yield client
+         client.client.close()
+

  @dataclass
  class SftpIndexer(FsspecIndexer):
@@ -74,13 +91,11 @@ class SftpIndexer(FsspecIndexer):
      index_config: SftpIndexerConfig
      connector_type: str = CONNECTOR_TYPE

-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
      def __post_init__(self):
          parsed_url = urlparse(self.index_config.remote_url)
          self.connection_config.host = parsed_url.hostname or self.connection_config.host
          self.connection_config.port = parsed_url.port or self.connection_config.port

-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
      def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
          for file in super().run(**kwargs):
              new_identifier = (
@@ -92,10 +107,6 @@ class SftpIndexer(FsspecIndexer):
              file.identifier = new_identifier
              yield file

-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-     def precheck(self) -> None:
-         super().precheck()
-
      def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
          path = file_data["name"]
          date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
@@ -128,20 +139,11 @@ class SftpDownloader(FsspecDownloader):
      connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
      download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
      def __post_init__(self):
          parsed_url = urlparse(self.download_config.remote_url)
          self.connection_config.host = parsed_url.hostname or self.connection_config.host
          self.connection_config.port = parsed_url.port or self.connection_config.port

-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         return super().run(file_data=file_data, **kwargs)
-
-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-     async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         return await super().run_async(file_data=file_data, **kwargs)
-

  class SftpUploaderConfig(FsspecUploaderConfig):
      pass
@@ -153,22 +155,6 @@ class SftpUploader(FsspecUploader):
      connection_config: SftpConnectionConfig
      upload_config: SftpUploaderConfig = field(default=None)

-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-     def __post_init__(self):
-         super().__post_init__()
-
-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-     def precheck(self) -> None:
-         super().precheck()
-
-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return super().run(path=path, file_data=file_data, **kwargs)
-
-     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return await super().run_async(path=path, file_data=file_data, **kwargs)
-

  sftp_source_entry = SourceRegistryEntry(
      indexer=SftpIndexer,

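Unlike the GCS and S3 variants, the SFTP get_client() does not simply delegate to the base class: as its inline comment notes, the paramiko.SSHClient opened by SFTPFileSystem is never closed on its own, so the context manager builds the filesystem itself and closes the SSH connection when the caller is done. A generic sketch of that pattern with illustrative names (FakeSSHClient stands in for the real client and is not part of the package):

from contextlib import contextmanager
from typing import Iterator


class FakeSSHClient:
    def close(self) -> None:
        print("ssh connection closed")


@contextmanager
def open_client() -> Iterator[FakeSSHClient]:
    client = FakeSSHClient()  # stands in for get_filesystem_class(protocol)(**access_config)
    yield client              # the caller's with-block runs here
    client.close()            # explicit cleanup once the block exits normally


with open_client() as client:
    pass  # use the client; close() runs after the block

Wrapping the yield in try/finally would additionally guarantee the close when the with-block raises; the version in the diff closes only on normal exit.
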
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -1,11 +1,10 @@
- import sys
  from contextlib import contextmanager
- from dataclasses import dataclass, replace
+ from dataclasses import dataclass
  from datetime import datetime
  from time import time
  from typing import TYPE_CHECKING, Any, Generator, Optional

- from pydantic import Field, Secret
+ from pydantic import BaseModel, Field, Secret

  from unstructured_ingest.__version__ import __version__ as unstructured_version
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
@@ -14,9 +13,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
+     BatchFileData,
+     BatchItem,
      ConnectionConfig,
      Downloader,
      DownloaderConfig,
+     DownloadResponse,
      FileData,
      FileDataSourceMetadata,
      Indexer,
@@ -40,6 +42,15 @@ CONNECTOR_TYPE = "mongodb"
  SERVER_API_VERSION = "1"


+ class MongoDBAdditionalMetadata(BaseModel):
+     database: str
+     collection: str
+
+
+ class MongoDBBatchFileData(BatchFileData):
+     additional_metadata: MongoDBAdditionalMetadata
+
+
  class MongoDBAccessConfig(AccessConfig):
      uri: Optional[str] = Field(default=None, description="URI to user when connecting")

@@ -122,7 +133,7 @@ class MongoDBIndexer(Indexer):
              logger.error(f"Failed to validate connection: {e}", exc_info=True)
              raise SourceConnectionError(f"Failed to validate connection: {e}")

-     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+     def run(self, **kwargs: Any) -> Generator[BatchFileData, None, None]:
          """Generates FileData objects for each document in the MongoDB collection."""
          with self.connection_config.get_client() as client:
              database = client[self.index_config.database]
@@ -130,12 +141,12 @@ class MongoDBIndexer(Indexer):

              # Get list of document IDs
              ids = collection.distinct("_id")
-             batch_size = self.index_config.batch_size if self.index_config else 100
+
+             ids = sorted(ids)
+             batch_size = self.index_config.batch_size

              for id_batch in batch_generator(ids, batch_size=batch_size):
                  # Make sure the hash is always a positive number to create identifier
-                 batch_id = str(hash(frozenset(id_batch)) + sys.maxsize + 1)
-
                  metadata = FileDataSourceMetadata(
                      date_processed=str(time()),
                      record_locator={
@@ -144,14 +155,13 @@
                      },
                  )

-                 file_data = FileData(
-                     identifier=batch_id,
-                     doc_type="batch",
+                 file_data = MongoDBBatchFileData(
                      connector_type=self.connector_type,
                      metadata=metadata,
-                     additional_metadata={
-                         "ids": [str(doc_id) for doc_id in id_batch],
-                     },
+                     batch_items=[BatchItem(identifier=str(doc_id)) for doc_id in id_batch],
+                     additional_metadata=MongoDBAdditionalMetadata(
+                         collection=self.index_config.collection, database=self.index_config.database
+                     ),
                  )
                  yield file_data

@@ -162,26 +172,58 @@ class MongoDBDownloader(Downloader):
      connection_config: MongoDBConnectionConfig
      connector_type: str = CONNECTOR_TYPE

-     @requires_dependencies(["pymongo"], extras="mongodb")
-     def create_client(self) -> "MongoClient":
-         from pymongo import MongoClient
-         from pymongo.driver_info import DriverInfo
-         from pymongo.server_api import ServerApi
+     def generate_download_response(
+         self, doc: dict, file_data: MongoDBBatchFileData
+     ) -> DownloadResponse:
+         from bson.objectid import ObjectId

-         access_config = self.connection_config.access_config.get_secret_value()
+         doc_id = doc["_id"]
+         doc.pop("_id", None)

-         if access_config.uri:
-             return MongoClient(
-                 access_config.uri,
-                 server_api=ServerApi(version=SERVER_API_VERSION),
-                 driver=DriverInfo(name="unstructured", version=unstructured_version),
-             )
-         else:
-             return MongoClient(
-                 host=self.connection_config.host,
-                 port=self.connection_config.port,
-                 server_api=ServerApi(version=SERVER_API_VERSION),
-             )
+         # Extract date_created from the document or ObjectId
+         date_created = None
+         if "date_created" in doc:
+             # If the document has a 'date_created' field, use it
+             date_created = doc["date_created"]
+             if isinstance(date_created, datetime):
+                 date_created = date_created.isoformat()
+             else:
+                 # Convert to ISO format if it's a string
+                 date_created = str(date_created)
+         elif isinstance(doc_id, ObjectId):
+             # Use the ObjectId's generation time
+             date_created = doc_id.generation_time.isoformat()
+
+         flattened_dict = flatten_dict(dictionary=doc)
+         concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+         # Create a FileData object for each document with source_identifiers
+         filename = f"{doc_id}.txt"
+         file_data.source_identifiers = SourceIdentifiers(
+             filename=filename,
+             fullpath=filename,
+         )
+         cast_file_data = FileData.cast(file_data=file_data)
+         cast_file_data.identifier = str(doc_id)
+
+         # Determine the download path
+         download_path = self.get_download_path(file_data=cast_file_data)
+         if download_path is None:
+             raise ValueError("Download path could not be determined")
+
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Write the concatenated values to the file
+         with open(download_path, "w", encoding="utf8") as f:
+             f.write(concatenated_values)
+
+         # Update metadata
+         cast_file_data.metadata.record_locator["document_id"] = str(doc_id)
+         cast_file_data.metadata.date_created = date_created
+
+         return super().generate_download_response(
+             file_data=cast_file_data, download_path=download_path
+         )

      @SourceConnectionError.wrap
      @requires_dependencies(["bson"], extras="mongodb")
@@ -190,82 +232,34 @@ class MongoDBDownloader(Downloader):
          from bson.errors import InvalidId
          from bson.objectid import ObjectId

-         client = self.create_client()
-         database = client[file_data.metadata.record_locator["database"]]
-         collection = database[file_data.metadata.record_locator["collection"]]
+         mongo_file_data = MongoDBBatchFileData.cast(file_data=file_data)

-         ids = file_data.additional_metadata.get("ids", [])
-         if not ids:
-             raise ValueError("No document IDs provided in additional_metadata")
+         with self.connection_config.get_client() as client:
+             database = client[mongo_file_data.additional_metadata.database]
+             collection = database[mongo_file_data.additional_metadata.collection]

-         object_ids = []
-         for doc_id in ids:
-             try:
-                 object_ids.append(ObjectId(doc_id))
-             except InvalidId as e:
-                 error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
-                 logger.error(error_message)
-                 raise ValueError(error_message) from e
+             ids = [item.identifier for item in mongo_file_data.batch_items]

-         try:
-             docs = list(collection.find({"_id": {"$in": object_ids}}))
-         except Exception as e:
-             logger.error(f"Failed to fetch documents: {e}", exc_info=True)
-             raise e
+             object_ids = []
+             for doc_id in ids:
+                 try:
+                     object_ids.append(ObjectId(doc_id))
+                 except InvalidId as e:
+                     error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                     logger.error(error_message)
+                     raise ValueError(error_message) from e
+
+             try:
+                 docs = list(collection.find({"_id": {"$in": object_ids}}))
+             except Exception as e:
+                 logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+                 raise e

          download_responses = []
          for doc in docs:
-             doc_id = doc["_id"]
-             doc.pop("_id", None)
-
-             # Extract date_created from the document or ObjectId
-             date_created = None
-             if "date_created" in doc:
-                 # If the document has a 'date_created' field, use it
-                 date_created = doc["date_created"]
-                 if isinstance(date_created, datetime):
-                     date_created = date_created.isoformat()
-                 else:
-                     # Convert to ISO format if it's a string
-                     date_created = str(date_created)
-             elif isinstance(doc_id, ObjectId):
-                 # Use the ObjectId's generation time
-                 date_created = doc_id.generation_time.isoformat()
-
-             flattened_dict = flatten_dict(dictionary=doc)
-             concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
-
-             # Create a FileData object for each document with source_identifiers
-             individual_file_data = replace(file_data)
-             individual_file_data.identifier = str(doc_id)
-             individual_file_data.source_identifiers = SourceIdentifiers(
-                 filename=str(doc_id),
-                 fullpath=str(doc_id),
-                 rel_path=str(doc_id),
-             )
-
-             # Determine the download path
-             download_path = self.get_download_path(individual_file_data)
-             if download_path is None:
-                 raise ValueError("Download path could not be determined")
-
-             download_path.parent.mkdir(parents=True, exist_ok=True)
-             download_path = download_path.with_suffix(".txt")
-
-             # Write the concatenated values to the file
-             with open(download_path, "w", encoding="utf8") as f:
-                 f.write(concatenated_values)
-
-             individual_file_data.local_download_path = str(download_path)
-
-             # Update metadata
-             individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
-             individual_file_data.metadata.date_created = date_created
-
-             download_response = self.generate_download_response(
-                 file_data=individual_file_data, download_path=download_path
+             download_responses.append(
+                 self.generate_download_response(doc=doc, file_data=mongo_file_data)
              )
-             download_responses.append(download_response)

          return download_responses

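The MongoDB refactor replaces the hand-rolled batch identifier (a hash of the ID set offset by sys.maxsize) with a typed MongoDBBatchFileData; the identifier is assumed to be derived by BatchFileData from its batch_items, which would be why the indexer no longer computes one. A hedged construction sketch; the database, collection, and ObjectId string are placeholders, not values from the diff:

from time import time

from unstructured_ingest.v2.interfaces import BatchItem, FileDataSourceMetadata
from unstructured_ingest.v2.processes.connectors.mongodb import (
    MongoDBAdditionalMetadata,
    MongoDBBatchFileData,
)

file_data = MongoDBBatchFileData(
    connector_type="mongodb",
    metadata=FileDataSourceMetadata(date_processed=str(time())),
    batch_items=[BatchItem(identifier="507f1f77bcf86cd799439011")],
    additional_metadata=MongoDBAdditionalMetadata(database="demo", collection="docs"),
)

# The downloader round-trips this with MongoDBBatchFileData.cast(file_data=...) and reads
# the database/collection from additional_metadata instead of metadata.record_locator.
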
unstructured_ingest/v2/processes/connectors/neo4j.py

@@ -10,7 +10,6 @@ from enum import Enum
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional

- import networkx as nx
  from pydantic import BaseModel, ConfigDict, Field, Secret

  from unstructured_ingest.error import DestinationConnectionError
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connector_registry import (

  if TYPE_CHECKING:
      from neo4j import AsyncDriver, Auth
+     from networkx import Graph, MultiDiGraph

  CONNECTOR_TYPE = "neo4j"

@@ -109,7 +109,9 @@ class Neo4jUploadStager(UploadStager):

          return output_filepath

-     def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> nx.Graph:
+     def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
+         import networkx as nx
+
          graph = nx.MultiDiGraph()
          graph.add_node(document_node)

@@ -180,7 +182,7 @@ class _GraphData(BaseModel):
      edges: list[_Edge]

      @classmethod
-     def from_nx(cls, nx_graph: nx.MultiDiGraph) -> _GraphData:
+     def from_nx(cls, nx_graph: "MultiDiGraph") -> _GraphData:
          nodes = list(nx_graph.nodes())
          edges = [
              _Edge(

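The neo4j change applies the same dependency-deferral pattern used in the connectors above: reference the third-party type only under TYPE_CHECKING and import the package inside the method that actually needs it, so importing the connector module no longer requires networkx. A generic sketch of the pattern:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; never imported at runtime.
    from networkx import MultiDiGraph


def build_graph() -> "MultiDiGraph":
    import networkx as nx  # deferred import, paid only when the function runs

    return nx.MultiDiGraph()
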
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -202,7 +202,7 @@ class OnedriveDownloader(Downloader):
          if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
              raise ValueError(
                  f"file data doesn't have enough information to get "
-                 f"file content: {file_data.to_dict()}"
+                 f"file content: {file_data.model_dump()}"
              )

          server_relative_path = file_data.source_identifiers.fullpath

unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Generator, Optional
  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import FileData
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
@@ -13,6 +12,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
  )
  from unstructured_ingest.v2.processes.connectors.sql.sql import (
      SQLAccessConfig,
+     SqlBatchFileData,
      SQLConnectionConfig,
      SQLDownloader,
      SQLDownloaderConfig,
@@ -99,12 +99,12 @@ class PostgresDownloader(SQLDownloader):
      connector_type: str = CONNECTOR_TYPE

      @requires_dependencies(["psycopg2"], extras="postgres")
-     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+     def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
          from psycopg2 import sql

-         table_name = file_data.additional_metadata["table_name"]
-         id_column = file_data.additional_metadata["id_column"]
-         ids = tuple(file_data.additional_metadata["ids"])
+         table_name = file_data.additional_metadata.table_name
+         id_column = file_data.additional_metadata.id_column
+         ids = tuple([item.identifier for item in file_data.batch_items])

          with self.connection_config.get_cursor() as cursor:
              fields = (