unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (87)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/test_postgres.py +100 -0
  10. test/integration/connectors/test_s3.py +152 -0
  11. test/integration/connectors/test_sqlite.py +91 -0
  12. test/integration/connectors/utils/__init__.py +0 -0
  13. test/integration/connectors/utils/constants.py +7 -0
  14. test/integration/connectors/utils/docker_compose.py +44 -0
  15. test/integration/connectors/utils/validation.py +198 -0
  16. test/integration/embedders/__init__.py +0 -0
  17. test/integration/embedders/conftest.py +13 -0
  18. test/integration/embedders/test_bedrock.py +49 -0
  19. test/integration/embedders/test_huggingface.py +26 -0
  20. test/integration/embedders/test_mixedbread.py +47 -0
  21. test/integration/embedders/test_octoai.py +41 -0
  22. test/integration/embedders/test_openai.py +41 -0
  23. test/integration/embedders/test_vertexai.py +41 -0
  24. test/integration/embedders/test_voyageai.py +41 -0
  25. test/integration/embedders/togetherai.py +43 -0
  26. test/integration/embedders/utils.py +44 -0
  27. test/integration/partitioners/__init__.py +0 -0
  28. test/integration/partitioners/test_partitioner.py +75 -0
  29. test/integration/utils.py +15 -0
  30. test/unit/__init__.py +0 -0
  31. test/unit/embed/__init__.py +0 -0
  32. test/unit/embed/test_mixedbreadai.py +41 -0
  33. test/unit/embed/test_octoai.py +20 -0
  34. test/unit/embed/test_openai.py +20 -0
  35. test/unit/embed/test_vertexai.py +25 -0
  36. test/unit/embed/test_voyageai.py +24 -0
  37. test/unit/test_chunking_utils.py +36 -0
  38. test/unit/test_error.py +27 -0
  39. test/unit/test_interfaces.py +280 -0
  40. test/unit/test_interfaces_v2.py +26 -0
  41. test/unit/test_logger.py +78 -0
  42. test/unit/test_utils.py +164 -0
  43. test/unit/test_utils_v2.py +82 -0
  44. unstructured_ingest/__version__.py +1 -1
  45. unstructured_ingest/cli/interfaces.py +2 -2
  46. unstructured_ingest/connector/notion/types/block.py +1 -0
  47. unstructured_ingest/connector/notion/types/database.py +1 -0
  48. unstructured_ingest/connector/notion/types/page.py +1 -0
  49. unstructured_ingest/embed/bedrock.py +0 -20
  50. unstructured_ingest/embed/huggingface.py +0 -21
  51. unstructured_ingest/embed/interfaces.py +29 -3
  52. unstructured_ingest/embed/mixedbreadai.py +0 -36
  53. unstructured_ingest/embed/octoai.py +2 -24
  54. unstructured_ingest/embed/openai.py +0 -20
  55. unstructured_ingest/embed/togetherai.py +40 -0
  56. unstructured_ingest/embed/vertexai.py +0 -20
  57. unstructured_ingest/embed/voyageai.py +1 -24
  58. unstructured_ingest/interfaces.py +1 -1
  59. unstructured_ingest/utils/dep_check.py +12 -0
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
  72. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  73. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
  75. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
  78. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
  79. unstructured_ingest/v2/processes/embedder.py +13 -0
  80. unstructured_ingest/v2/processes/partitioner.py +2 -1
  81. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
  82. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
  83. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
  84. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  85. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
  86. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
  87. {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,85 @@
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.interfaces import AccessConfig
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+     DatabricksVolumesConnectionConfig,
+     DatabricksVolumesDownloader,
+     DatabricksVolumesDownloaderConfig,
+     DatabricksVolumesIndexer,
+     DatabricksVolumesIndexerConfig,
+     DatabricksVolumesUploader,
+     DatabricksVolumesUploaderConfig,
+ )
+
+ CONNECTOR_TYPE = "databricks_volumes_gcp"
+
+
+ class DatabricksGoogleVolumesAccessConfig(AccessConfig):
+     account_id: Optional[str] = Field(
+         default=None,
+         description="The Databricks account ID for the Databricks " "accounts endpoint.",
+     )
+     profile: Optional[str] = None
+     google_credentials: Optional[str] = None
+     google_service_account: Optional[str] = None
+
+
+ class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+     access_config: Secret[DatabricksGoogleVolumesAccessConfig]
+
+
+ class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
+     connection_config: DatabricksGoogleVolumesConnectionConfig
+     index_config: DatabricksGoogleVolumesIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
+     connection_config: DatabricksGoogleVolumesConnectionConfig
+     download_config: DatabricksVolumesDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
+     connection_config: DatabricksGoogleVolumesConnectionConfig
+     upload_config: DatabricksGoogleVolumesUploaderConfig = field(
+         default_factory=DatabricksGoogleVolumesUploaderConfig
+     )
+     connector_type: str = CONNECTOR_TYPE
+
+
+ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
+     connection_config=DatabricksGoogleVolumesConnectionConfig,
+     uploader=DatabricksGoogleVolumesUploader,
+     uploader_config=DatabricksGoogleVolumesUploaderConfig,
+ )
+
+ databricks_gcp_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksGoogleVolumesConnectionConfig,
+     indexer=DatabricksGoogleVolumesIndexer,
+     indexer_config=DatabricksGoogleVolumesIndexerConfig,
+     downloader=DatabricksGoogleVolumesDownloader,
+     downloader_config=DatabricksGoogleVolumesDownloaderConfig,
+ )
@@ -0,0 +1,86 @@
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.interfaces import AccessConfig
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+     DatabricksVolumesConnectionConfig,
+     DatabricksVolumesDownloader,
+     DatabricksVolumesDownloaderConfig,
+     DatabricksVolumesIndexer,
+     DatabricksVolumesIndexerConfig,
+     DatabricksVolumesUploader,
+     DatabricksVolumesUploaderConfig,
+ )
+
+ CONNECTOR_TYPE = "databricks_volumes"
+
+
+ class DatabricksNativeVolumesAccessConfig(AccessConfig):
+     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+     client_secret: Optional[str] = Field(
+         default=None, description="Client Secret of the OAuth app."
+     )
+     profile: Optional[str] = None
+     azure_workspace_resource_id: Optional[str] = Field(
+         default=None,
+         description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+         "which is exchanged for a Databricks host URL.",
+     )
+
+
+ class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+     access_config: Secret[DatabricksNativeVolumesAccessConfig]
+
+
+ class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
+     connection_config: DatabricksNativeVolumesConnectionConfig
+     index_config: DatabricksNativeVolumesIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
+     connection_config: DatabricksNativeVolumesConnectionConfig
+     download_config: DatabricksVolumesDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
+     connection_config: DatabricksNativeVolumesConnectionConfig
+     upload_config: DatabricksNativeVolumesUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
+     connection_config=DatabricksNativeVolumesConnectionConfig,
+     uploader=DatabricksNativeVolumesUploader,
+     uploader_config=DatabricksNativeVolumesUploaderConfig,
+ )
+
+ databricks_native_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksNativeVolumesConnectionConfig,
+     indexer=DatabricksNativeVolumesIndexer,
+     indexer_config=DatabricksNativeVolumesIndexerConfig,
+     downloader=DatabricksNativeVolumesDownloader,
+     downloader_config=DatabricksNativeVolumesDownloaderConfig,
+ )
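
The two new modules above follow one template: each platform flavor only specializes the access config, subclasses the shared indexer/downloader/uploader from databricks/volumes.py, and exposes a pair of registry entries. As a rough illustration of that template only (hypothetical code, not part of this release: the "databricks_volumes_other" connector type, the DatabricksOtherVolumes* names, and the token field are invented placeholders; the base classes and registry keywords are the ones that appear in the hunks above):

from dataclasses import dataclass
from typing import Optional

from pydantic import Field, Secret

from unstructured_ingest.v2.interfaces import AccessConfig
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
    DatabricksVolumesConnectionConfig,
    DatabricksVolumesDownloader,
    DatabricksVolumesDownloaderConfig,
    DatabricksVolumesIndexer,
    DatabricksVolumesIndexerConfig,
)

CONNECTOR_TYPE = "databricks_volumes_other"  # hypothetical connector type


class DatabricksOtherVolumesAccessConfig(AccessConfig):
    # Only the platform-specific credentials live here; everything else is inherited.
    token: Optional[str] = Field(default=None, description="Hypothetical auth token.")


class DatabricksOtherVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    access_config: Secret[DatabricksOtherVolumesAccessConfig]


@dataclass
class DatabricksOtherVolumesIndexer(DatabricksVolumesIndexer):
    connection_config: DatabricksOtherVolumesConnectionConfig
    index_config: DatabricksVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE


@dataclass
class DatabricksOtherVolumesDownloader(DatabricksVolumesDownloader):
    connection_config: DatabricksOtherVolumesConnectionConfig
    download_config: DatabricksVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE


# Registering the flavor mirrors the entries defined in the real modules above.
databricks_other_volumes_source_entry = SourceRegistryEntry(
    connection_config=DatabricksOtherVolumesConnectionConfig,
    indexer=DatabricksOtherVolumesIndexer,
    indexer_config=DatabricksVolumesIndexerConfig,
    downloader=DatabricksOtherVolumesDownloader,
    downloader_config=DatabricksVolumesDownloaderConfig,
)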
@@ -1,21 +1,35 @@
  import os
  from dataclasses import dataclass
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional
+ from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret

- from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.error import (
+     DestinationConnectionError,
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
      FileData,
+     FileDataSourceMetadata,
+     Indexer,
+     IndexerConfig,
+     SourceIdentifiers,
      Uploader,
      UploaderConfig,
  )
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )

  if TYPE_CHECKING:
      from databricks.sdk import WorkspaceClient
@@ -32,16 +46,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
          "https://accounts.azuredatabricks.net/ (Azure), "
          "or https://accounts.gcp.databricks.com/ (GCP).",
      )
-     username: Optional[str] = Field(
-         default=None,
-         description="The Databricks username part of basic authentication. "
-         "Only possible when Host is *.cloud.databricks.com (AWS).",
-     )
-     password: Optional[str] = Field(
-         default=None,
-         description="The Databricks password part of basic authentication. "
-         "Only possible when Host is *.cloud.databricks.com (AWS).",
-     )
      client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
      client_secret: Optional[str] = Field(
          default=None, description="Client Secret of the OAuth app."
@@ -78,7 +82,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
          "argument. This argument also holds the currently "
          "selected auth.",
      )
-     cluster_id: Optional[str] = None
      google_credentials: Optional[str] = None
      google_service_account: Optional[str] = None

@@ -93,17 +96,11 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig):
          "Databricks workspace endpoint or the "
          "Databricks accounts endpoint.",
      )
-
-
- class DatabricksVolumesUploaderConfig(UploaderConfig):
      volume: str = Field(description="Name of volume in the Unity Catalog")
      catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
      volume_path: Optional[str] = Field(
          default=None, description="Optional path within the volume to write to"
      )
-     overwrite: bool = Field(
-         default=False, description="If true, an existing file will be overwritten."
-     )
      databricks_schema: str = Field(
          default="default",
          alias="schema",
@@ -117,33 +114,121 @@ class DatabricksVolumesUploaderConfig(UploaderConfig):
              path = f"{path}/{self.volume_path}"
          return path

-
- @dataclass
- class DatabricksVolumesUploader(Uploader):
-     connector_type: str = CONNECTOR_TYPE
-     upload_config: DatabricksVolumesUploaderConfig
-     connection_config: DatabricksVolumesConnectionConfig
-
      @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
      def get_client(self) -> "WorkspaceClient":
          from databricks.sdk import WorkspaceClient

          return WorkspaceClient(
-             host=self.connection_config.host,
-             **self.connection_config.access_config.get_secret_value().model_dump(),
+             host=self.host,
+             **self.access_config.get_secret_value().model_dump(),
          )

+
+ @dataclass
+ class DatabricksVolumesIndexerConfig(IndexerConfig):
+     recursive: bool = False
+
+
+ @dataclass
+ class DatabricksVolumesIndexer(Indexer):
+     index_config: DatabricksVolumesIndexerConfig
+     connection_config: DatabricksVolumesConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
      def precheck(self) -> None:
          try:
-             assert self.get_client().current_user.me().active
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         for file_info in self.connection_config.get_client().dbfs.list(
+             path=self.connection_config.path, recursive=self.index_config.recursive
+         ):
+             if file_info.is_dir:
+                 continue
+             rel_path = file_info.path.replace(self.connection_config.path, "")
+             if rel_path.startswith("/"):
+                 rel_path = rel_path[1:]
+             filename = Path(file_info.path).name
+             yield FileData(
+                 identifier=file_info.path,
+                 connector_type=CONNECTOR_TYPE,
+                 source_identifiers=SourceIdentifiers(
+                     filename=filename,
+                     rel_path=rel_path,
+                     fullpath=file_info.path,
+                 ),
+                 additional_metadata={
+                     "catalog": self.connection_config.catalog,
+                 },
+                 metadata=FileDataSourceMetadata(
+                     url=file_info.path, date_modified=str(file_info.modification_time)
+                 ),
+             )
+
+
+ @dataclass
+ class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksVolumesDownloader(Downloader):
+     download_config: DatabricksVolumesDownloaderConfig
+     connection_config: DatabricksVolumesConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def get_download_path(self, file_data: FileData) -> Path:
+         return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+         download_path = self.get_download_path(file_data=file_data)
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+         logger.info(f"Writing {file_data.identifier} to {download_path}")
+         try:
+             with self.connection_config.get_client().dbfs.download(path=file_data.identifier) as c:
+                 read_content = c._read_handle.read()
+             with open(download_path, "wb") as f:
+                 f.write(read_content)
+         except Exception as e:
+             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+         return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+ class DatabricksVolumesUploaderConfig(UploaderConfig):
+     overwrite: bool = Field(
+         default=False, description="If true, an existing file will be overwritten."
+     )
+
+
+ @dataclass
+ class DatabricksVolumesUploader(Uploader):
+     upload_config: DatabricksVolumesUploaderConfig
+     connection_config: DatabricksVolumesConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         try:
+             assert self.connection_config.get_client().current_user.me().active
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         output_path = os.path.join(self.upload_config.path, path.name)
+         output_path = os.path.join(self.connection_config.path, path.name)
          with open(path, "rb") as elements_file:
-             self.get_client().files.upload(
+             self.connection_config.get_client().files.upload(
                  file_path=output_path,
                  contents=elements_file,
                  overwrite=self.upload_config.overwrite,
@@ -155,3 +240,11 @@ databricks_volumes_destination_entry = DestinationRegistryEntry(
      uploader=DatabricksVolumesUploader,
      uploader_config=DatabricksVolumesUploaderConfig,
  )
+
+ databricks_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksVolumesConnectionConfig,
+     indexer=DatabricksVolumesIndexer,
+     indexer_config=DatabricksVolumesIndexerConfig,
+     downloader=DatabricksVolumesDownloader,
+     downloader_config=DatabricksVolumesDownloaderConfig,
+ )
@@ -1,26 +1,37 @@
  import json
+ import sys
  from dataclasses import dataclass, field
+ from datetime import datetime
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional
+ from time import time
+ from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret

  from unstructured_ingest.__version__ import __version__ as unstructured_version
- from unstructured_ingest.error import DestinationConnectionError
- from unstructured_ingest.utils.data_prep import batch_generator
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
      FileData,
+     FileDataSourceMetadata,
+     Indexer,
+     IndexerConfig,
+     SourceIdentifiers,
      Uploader,
      UploaderConfig,
      UploadStager,
      UploadStagerConfig,
+     download_responses,
  )
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
+     SourceRegistryEntry,
  )

  if TYPE_CHECKING:
@@ -53,6 +64,207 @@ class MongoDBUploadStagerConfig(UploadStagerConfig):
      pass


+ class MongoDBIndexerConfig(IndexerConfig):
+     batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+ class MongoDBDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class MongoDBIndexer(Indexer):
+     connection_config: MongoDBConnectionConfig
+     index_config: MongoDBIndexerConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         """Validates the connection to the MongoDB server."""
+         try:
+             client = self.create_client()
+             client.admin.command("ping")
+         except Exception as e:
+             logger.error(f"Failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+     @requires_dependencies(["pymongo"], extras="mongodb")
+     def create_client(self) -> "MongoClient":
+         from pymongo import MongoClient
+         from pymongo.driver_info import DriverInfo
+         from pymongo.server_api import ServerApi
+
+         access_config = self.connection_config.access_config.get_secret_value()
+
+         if access_config.uri:
+             return MongoClient(
+                 access_config.uri,
+                 server_api=ServerApi(version=SERVER_API_VERSION),
+                 driver=DriverInfo(name="unstructured", version=unstructured_version),
+             )
+         else:
+             return MongoClient(
+                 host=self.connection_config.host,
+                 port=self.connection_config.port,
+                 server_api=ServerApi(version=SERVER_API_VERSION),
+             )
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         """Generates FileData objects for each document in the MongoDB collection."""
+         client = self.create_client()
+         database = client[self.connection_config.database]
+         collection = database[self.connection_config.collection]
+
+         # Get list of document IDs
+         ids = collection.distinct("_id")
+         batch_size = self.index_config.batch_size if self.index_config else 100
+
+         for id_batch in batch_generator(ids, batch_size=batch_size):
+             # Make sure the hash is always a positive number to create identifier
+             batch_id = str(hash(frozenset(id_batch)) + sys.maxsize + 1)
+
+             metadata = FileDataSourceMetadata(
+                 date_processed=str(time()),
+                 record_locator={
+                     "database": self.connection_config.database,
+                     "collection": self.connection_config.collection,
+                 },
+             )
+
+             file_data = FileData(
+                 identifier=batch_id,
+                 doc_type="batch",
+                 connector_type=self.connector_type,
+                 metadata=metadata,
+                 additional_metadata={
+                     "ids": [str(doc_id) for doc_id in id_batch],
+                 },
+             )
+             yield file_data
+
+
+ @dataclass
+ class MongoDBDownloader(Downloader):
+     download_config: MongoDBDownloaderConfig
+     connection_config: MongoDBConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     @requires_dependencies(["pymongo"], extras="mongodb")
+     def create_client(self) -> "MongoClient":
+         from pymongo import MongoClient
+         from pymongo.driver_info import DriverInfo
+         from pymongo.server_api import ServerApi
+
+         access_config = self.connection_config.access_config.get_secret_value()
+
+         if access_config.uri:
+             return MongoClient(
+                 access_config.uri,
+                 server_api=ServerApi(version=SERVER_API_VERSION),
+                 driver=DriverInfo(name="unstructured", version=unstructured_version),
+             )
+         else:
+             return MongoClient(
+                 host=self.connection_config.host,
+                 port=self.connection_config.port,
+                 server_api=ServerApi(version=SERVER_API_VERSION),
+             )
+
+     @SourceConnectionError.wrap
+     @requires_dependencies(["bson"], extras="mongodb")
+     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         """Fetches the document from MongoDB and writes it to a file."""
+         from bson.errors import InvalidId
+         from bson.objectid import ObjectId
+
+         client = self.create_client()
+         database = client[self.connection_config.database]
+         collection = database[self.connection_config.collection]
+
+         ids = file_data.additional_metadata.get("ids", [])
+         if not ids:
+             raise ValueError("No document IDs provided in additional_metadata")
+
+         object_ids = []
+         for doc_id in ids:
+             try:
+                 object_ids.append(ObjectId(doc_id))
+             except InvalidId as e:
+                 error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                 logger.error(error_message)
+                 raise ValueError(error_message) from e
+
+         try:
+             docs = list(collection.find({"_id": {"$in": object_ids}}))
+         except Exception as e:
+             logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+             raise e
+
+         download_responses = []
+         for doc in docs:
+             doc_id = doc["_id"]
+             doc.pop("_id", None)
+
+             # Extract date_created from the document or ObjectId
+             date_created = None
+             if "date_created" in doc:
+                 # If the document has a 'date_created' field, use it
+                 date_created = doc["date_created"]
+                 if isinstance(date_created, datetime):
+                     date_created = date_created.isoformat()
+                 else:
+                     # Convert to ISO format if it's a string
+                     date_created = str(date_created)
+             elif isinstance(doc_id, ObjectId):
+                 # Use the ObjectId's generation time
+                 date_created = doc_id.generation_time.isoformat()
+
+             flattened_dict = flatten_dict(dictionary=doc)
+             concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+             # Create a FileData object for each document with source_identifiers
+             individual_file_data = FileData(
+                 identifier=str(doc_id),
+                 connector_type=self.connector_type,
+                 source_identifiers=SourceIdentifiers(
+                     filename=str(doc_id),
+                     fullpath=str(doc_id),
+                     rel_path=str(doc_id),
+                 ),
+             )
+
+             # Determine the download path
+             download_path = self.get_download_path(individual_file_data)
+             if download_path is None:
+                 raise ValueError("Download path could not be determined")
+
+             download_path.parent.mkdir(parents=True, exist_ok=True)
+             download_path = download_path.with_suffix(".txt")
+
+             # Write the concatenated values to the file
+             with open(download_path, "w", encoding="utf8") as f:
+                 f.write(concatenated_values)
+
+             individual_file_data.local_download_path = str(download_path)
+
+             # Update metadata
+             individual_file_data.metadata = FileDataSourceMetadata(
+                 date_created=date_created,  # Include date_created here
+                 date_processed=str(time()),
+                 record_locator={
+                     "database": self.connection_config.database,
+                     "collection": self.connection_config.collection,
+                     "document_id": str(doc_id),
+                 },
+             )
+
+             download_response = self.generate_download_response(
+                 file_data=individual_file_data, download_path=download_path
+             )
+             download_responses.append(download_response)
+
+         return download_responses
+
+
  @dataclass
  class MongoDBUploadStager(UploadStager):
      upload_stager_config: MongoDBUploadStagerConfig = field(
@@ -138,3 +350,11 @@ mongodb_destination_entry = DestinationRegistryEntry(
      upload_stager=MongoDBUploadStager,
      upload_stager_config=MongoDBUploadStagerConfig,
  )
+
+ mongodb_source_entry = SourceRegistryEntry(
+     connection_config=MongoDBConnectionConfig,
+     indexer_config=MongoDBIndexerConfig,
+     indexer=MongoDBIndexer,
+     downloader_config=MongoDBDownloaderConfig,
+     downloader=MongoDBDownloader,
+ )
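
For orientation, the new MongoDB source side indexes a collection by listing all document ids, slicing them into batches, and giving each batch a deterministic, non-negative identifier; the downloader later resolves those ids back into documents. Below is a minimal, self-contained sketch of that batch-identifier scheme only (assumptions: plain id strings and the same hash-based formula used in MongoDBIndexer.run above; batch_generator here is a simple stand-in, not the package's own helper from utils.data_prep):

import sys
from typing import Iterable, Iterator


def batch_generator(items: list, batch_size: int) -> Iterator[list]:
    # Stand-in helper: yields consecutive slices of at most batch_size items.
    for i in range(0, len(items), batch_size):
        yield items[i : i + batch_size]


def batch_identifier(id_batch: Iterable[str]) -> str:
    # frozenset makes the identifier order-independent within a batch;
    # adding sys.maxsize + 1 shifts hash() output into the non-negative range,
    # mirroring the "always a positive number" comment in the diff above.
    return str(hash(frozenset(id_batch)) + sys.maxsize + 1)


doc_ids = [f"doc-{n}" for n in range(250)]  # placeholder ids
for batch in batch_generator(doc_ids, batch_size=100):
    print(batch_identifier(batch), len(batch))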