unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py (new file)

@@ -0,0 +1,85 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_gcp"
+
+
+class DatabricksGoogleVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint.",
+    )
+    profile: Optional[str] = None
+    google_credentials: Optional[str] = None
+    google_service_account: Optional[str] = None
+
+
+class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksGoogleVolumesAccessConfig]
+
+
+class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksGoogleVolumesConnectionConfig
+    index_config: DatabricksGoogleVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksGoogleVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksGoogleVolumesConnectionConfig
+    upload_config: DatabricksGoogleVolumesUploaderConfig = field(
+        default_factory=DatabricksGoogleVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksGoogleVolumesConnectionConfig,
+    uploader=DatabricksGoogleVolumesUploader,
+    uploader_config=DatabricksGoogleVolumesUploaderConfig,
+)
+
+databricks_gcp_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksGoogleVolumesConnectionConfig,
+    indexer=DatabricksGoogleVolumesIndexer,
+    indexer_config=DatabricksGoogleVolumesIndexerConfig,
+    downloader=DatabricksGoogleVolumesDownloader,
+    downloader_config=DatabricksGoogleVolumesDownloaderConfig,
+)
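The per-platform modules added in this release only swap in a platform-specific access config; everything else is inherited from the shared base classes in databricks/volumes.py (shown further down). A minimal sketch of wiring the GCP variant up as a destination, assuming the pydantic configs accept ordinary keyword construction; the endpoint, catalog, volume, and service-account values are placeholders:

```python
# Sketch only: class and field names come from the diff above; all values are placeholders.
from unstructured_ingest.v2.processes.connectors.databricks.volumes_gcp import (
    DatabricksGoogleVolumesAccessConfig,
    DatabricksGoogleVolumesConnectionConfig,
    DatabricksGoogleVolumesUploader,
    DatabricksGoogleVolumesUploaderConfig,
)

connection_config = DatabricksGoogleVolumesConnectionConfig(
    host="https://<workspace>.gcp.databricks.com",  # placeholder workspace URL
    catalog="my_catalog",                           # Unity Catalog name
    volume="my_volume",                             # volume inside the catalog
    access_config=DatabricksGoogleVolumesAccessConfig(
        google_service_account="ingest-sa@my-project.iam.gserviceaccount.com",  # placeholder
    ),
)

uploader = DatabricksGoogleVolumesUploader(
    connection_config=connection_config,
    upload_config=DatabricksGoogleVolumesUploaderConfig(overwrite=True),
)
# precheck() opens a WorkspaceClient and fails fast if the credentials are wrong.
uploader.precheck()
```

The databricks_gcp_volumes_destination_entry and databricks_gcp_volumes_source_entry at the bottom of the file are what register these classes with the v2 connector registry.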
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py (new file)

@@ -0,0 +1,86 @@
+from dataclasses import dataclass
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes"
+
+
+class DatabricksNativeVolumesAccessConfig(AccessConfig):
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
+    profile: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+
+
+class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksNativeVolumesAccessConfig]
+
+
+class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksNativeVolumesConnectionConfig
+    index_config: DatabricksNativeVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksNativeVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksNativeVolumesConnectionConfig
+    upload_config: DatabricksNativeVolumesUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_native_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksNativeVolumesConnectionConfig,
+    uploader=DatabricksNativeVolumesUploader,
+    uploader_config=DatabricksNativeVolumesUploaderConfig,
+)
+
+databricks_native_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksNativeVolumesConnectionConfig,
+    indexer=DatabricksNativeVolumesIndexer,
+    indexer_config=DatabricksNativeVolumesIndexerConfig,
+    downloader=DatabricksNativeVolumesDownloader,
+    downloader_config=DatabricksNativeVolumesDownloaderConfig,
+)
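The native variant keeps the plain "databricks_volumes" connector type and authenticates with an OAuth client ID/secret. A rough sketch of driving the new source side (indexer plus downloader) directly; it assumes DownloaderConfig exposes a download_dir field, which is how the base downloader shown below resolves local paths, and every credential and path value is a placeholder:

```python
# Sketch only: a direct run of the new source classes; in a real pipeline the
# registry entries at the bottom of the file wire these up instead.
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
    DatabricksNativeVolumesDownloader,
    DatabricksNativeVolumesDownloaderConfig,
    DatabricksNativeVolumesIndexer,
    DatabricksNativeVolumesIndexerConfig,
)

connection_config = DatabricksNativeVolumesConnectionConfig(
    host="https://<workspace>.cloud.databricks.com",  # placeholder
    catalog="my_catalog",
    volume="my_volume",
    access_config=DatabricksNativeVolumesAccessConfig(
        client_id="<oauth-client-id>",          # placeholder
        client_secret="<oauth-client-secret>",  # placeholder
    ),
)

indexer = DatabricksNativeVolumesIndexer(
    connection_config=connection_config,
    index_config=DatabricksNativeVolumesIndexerConfig(recursive=True),
)
downloader = DatabricksNativeVolumesDownloader(
    connection_config=connection_config,
    download_config=DatabricksNativeVolumesDownloaderConfig(download_dir=Path("/tmp/volumes")),
)

# The indexer yields one FileData per file in the volume; the downloader then
# copies each file under download_dir, preserving its relative path.
for file_data in indexer.run():
    downloader.run(file_data=file_data)
```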
unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -1,21 +1,35 @@
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 
 if TYPE_CHECKING:
     from databricks.sdk import WorkspaceClient
@@ -32,16 +46,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         "https://accounts.azuredatabricks.net/ (Azure), "
         "or https://accounts.gcp.databricks.com/ (GCP).",
     )
-    username: Optional[str] = Field(
-        default=None,
-        description="The Databricks username part of basic authentication. "
-        "Only possible when Host is *.cloud.databricks.com (AWS).",
-    )
-    password: Optional[str] = Field(
-        default=None,
-        description="The Databricks password part of basic authentication. "
-        "Only possible when Host is *.cloud.databricks.com (AWS).",
-    )
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
         default=None, description="Client Secret of the OAuth app."
@@ -78,7 +82,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         "argument. This argument also holds the currently "
         "selected auth.",
     )
-    cluster_id: Optional[str] = None
     google_credentials: Optional[str] = None
     google_service_account: Optional[str] = None
 
@@ -93,17 +96,11 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig):
         "Databricks workspace endpoint or the "
         "Databricks accounts endpoint.",
     )
-
-
-class DatabricksVolumesUploaderConfig(UploaderConfig):
     volume: str = Field(description="Name of volume in the Unity Catalog")
     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
     volume_path: Optional[str] = Field(
        default=None, description="Optional path within the volume to write to"
     )
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
     databricks_schema: str = Field(
         default="default",
         alias="schema",
@@ -117,33 +114,121 @@ class DatabricksVolumesUploaderConfig(UploaderConfig):
             path = f"{path}/{self.volume_path}"
         return path
 
-
-@dataclass
-class DatabricksVolumesUploader(Uploader):
-    connector_type: str = CONNECTOR_TYPE
-    upload_config: DatabricksVolumesUploaderConfig
-    connection_config: DatabricksVolumesConnectionConfig
-
     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
     def get_client(self) -> "WorkspaceClient":
         from databricks.sdk import WorkspaceClient
 
         return WorkspaceClient(
-            host=self.
-            **self.
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
         )
 
+
+@dataclass
+class DatabricksVolumesIndexerConfig(IndexerConfig):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
     def precheck(self) -> None:
         try:
-
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        for file_info in self.connection_config.get_client().dbfs.list(
+            path=self.connection_config.path, recursive=self.index_config.recursive
+        ):
+            if file_info.is_dir:
+                continue
+            rel_path = file_info.path.replace(self.connection_config.path, "")
+            if rel_path.startswith("/"):
+                rel_path = rel_path[1:]
+            filename = Path(file_info.path).name
+            yield FileData(
+                identifier=file_info.path,
+                connector_type=CONNECTOR_TYPE,
+                source_identifiers=SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                ),
+                additional_metadata={
+                    "catalog": self.connection_config.catalog,
+                },
+                metadata=FileDataSourceMetadata(
+                    url=file_info.path, date_modified=str(file_info.modification_time)
+                ),
+            )
+
+
+@dataclass
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=file_data.identifier) as c:
+                read_content = c._read_handle.read()
+            with open(download_path, "wb") as f:
+                f.write(read_content)
+        except Exception as e:
+            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig):
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(self.
+        output_path = os.path.join(self.connection_config.path, path.name)
         with open(path, "rb") as elements_file:
-            self.get_client().files.upload(
+            self.connection_config.get_client().files.upload(
                 file_path=output_path,
                 contents=elements_file,
                 overwrite=self.upload_config.overwrite,
@@ -155,3 +240,11 @@ databricks_volumes_destination_entry = DestinationRegistryEntry(
     uploader=DatabricksVolumesUploader,
     uploader_config=DatabricksVolumesUploaderConfig,
 )
+
+databricks_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksVolumesConnectionConfig,
+    indexer=DatabricksVolumesIndexer,
+    indexer_config=DatabricksVolumesIndexerConfig,
+    downloader=DatabricksVolumesDownloader,
+    downloader_config=DatabricksVolumesDownloaderConfig,
+)
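Beyond dropping basic auth and cluster_id, the key move here is that volume, catalog, volume_path, and the derived path property now live on DatabricksVolumesConnectionConfig, so the new indexer, downloader, and uploader all resolve the same client and volume path. The indexer turns each listed file into a FileData; a small standalone sketch of how it derives the per-file identifiers, mirroring the run() logic above (the volume root value is a placeholder for whatever connection_config.path returns):

```python
# Sketch: mirrors the rel_path/filename derivation in DatabricksVolumesIndexer.run().
from pathlib import Path

volume_root = "/Volumes/my_catalog/default/my_volume"  # placeholder for connection_config.path
file_path = f"{volume_root}/reports/2024/summary.pdf"  # a file_info.path from dbfs.list()

rel_path = file_path.replace(volume_root, "")
if rel_path.startswith("/"):
    rel_path = rel_path[1:]
filename = Path(file_path).name

print(rel_path)   # reports/2024/summary.pdf -> becomes SourceIdentifiers.rel_path
print(filename)   # summary.pdf              -> becomes SourceIdentifiers.filename
```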
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -1,26 +1,37 @@
 import json
+import sys
 from dataclasses import dataclass, field
+from datetime import datetime
 from pathlib import Path
-from
+from time import time
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.__version__ import __version__ as unstructured_version
-from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
+    SourceRegistryEntry,
 )
 
 if TYPE_CHECKING:
@@ -53,6 +64,207 @@ class MongoDBUploadStagerConfig(UploadStagerConfig):
     pass
 
 
+class MongoDBIndexerConfig(IndexerConfig):
+    batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+class MongoDBDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class MongoDBIndexer(Indexer):
+    connection_config: MongoDBConnectionConfig
+    index_config: MongoDBIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        """Validates the connection to the MongoDB server."""
+        try:
+            client = self.create_client()
+            client.admin.command("ping")
+        except Exception as e:
+            logger.error(f"Failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to validate connection: {e}")
+
+    @requires_dependencies(["pymongo"], extras="mongodb")
+    def create_client(self) -> "MongoClient":
+        from pymongo import MongoClient
+        from pymongo.driver_info import DriverInfo
+        from pymongo.server_api import ServerApi
+
+        access_config = self.connection_config.access_config.get_secret_value()
+
+        if access_config.uri:
+            return MongoClient(
+                access_config.uri,
+                server_api=ServerApi(version=SERVER_API_VERSION),
+                driver=DriverInfo(name="unstructured", version=unstructured_version),
+            )
+        else:
+            return MongoClient(
+                host=self.connection_config.host,
+                port=self.connection_config.port,
+                server_api=ServerApi(version=SERVER_API_VERSION),
+            )
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        """Generates FileData objects for each document in the MongoDB collection."""
+        client = self.create_client()
+        database = client[self.connection_config.database]
+        collection = database[self.connection_config.collection]
+
+        # Get list of document IDs
+        ids = collection.distinct("_id")
+        batch_size = self.index_config.batch_size if self.index_config else 100
+
+        for id_batch in batch_generator(ids, batch_size=batch_size):
+            # Make sure the hash is always a positive number to create identifier
+            batch_id = str(hash(frozenset(id_batch)) + sys.maxsize + 1)
+
+            metadata = FileDataSourceMetadata(
+                date_processed=str(time()),
+                record_locator={
+                    "database": self.connection_config.database,
+                    "collection": self.connection_config.collection,
+                },
+            )
+
+            file_data = FileData(
+                identifier=batch_id,
+                doc_type="batch",
+                connector_type=self.connector_type,
+                metadata=metadata,
+                additional_metadata={
+                    "ids": [str(doc_id) for doc_id in id_batch],
+                },
+            )
+            yield file_data
+
+
+@dataclass
+class MongoDBDownloader(Downloader):
+    download_config: MongoDBDownloaderConfig
+    connection_config: MongoDBConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["pymongo"], extras="mongodb")
+    def create_client(self) -> "MongoClient":
+        from pymongo import MongoClient
+        from pymongo.driver_info import DriverInfo
+        from pymongo.server_api import ServerApi
+
+        access_config = self.connection_config.access_config.get_secret_value()
+
+        if access_config.uri:
+            return MongoClient(
+                access_config.uri,
+                server_api=ServerApi(version=SERVER_API_VERSION),
+                driver=DriverInfo(name="unstructured", version=unstructured_version),
+            )
+        else:
+            return MongoClient(
+                host=self.connection_config.host,
+                port=self.connection_config.port,
+                server_api=ServerApi(version=SERVER_API_VERSION),
+            )
+
+    @SourceConnectionError.wrap
+    @requires_dependencies(["bson"], extras="mongodb")
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        """Fetches the document from MongoDB and writes it to a file."""
+        from bson.errors import InvalidId
+        from bson.objectid import ObjectId
+
+        client = self.create_client()
+        database = client[self.connection_config.database]
+        collection = database[self.connection_config.collection]
+
+        ids = file_data.additional_metadata.get("ids", [])
+        if not ids:
+            raise ValueError("No document IDs provided in additional_metadata")
+
+        object_ids = []
+        for doc_id in ids:
+            try:
+                object_ids.append(ObjectId(doc_id))
+            except InvalidId as e:
+                error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                logger.error(error_message)
+                raise ValueError(error_message) from e
+
+        try:
+            docs = list(collection.find({"_id": {"$in": object_ids}}))
+        except Exception as e:
+            logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+            raise e
+
+        download_responses = []
+        for doc in docs:
+            doc_id = doc["_id"]
+            doc.pop("_id", None)
+
+            # Extract date_created from the document or ObjectId
+            date_created = None
+            if "date_created" in doc:
+                # If the document has a 'date_created' field, use it
+                date_created = doc["date_created"]
+                if isinstance(date_created, datetime):
+                    date_created = date_created.isoformat()
+                else:
+                    # Convert to ISO format if it's a string
+                    date_created = str(date_created)
+            elif isinstance(doc_id, ObjectId):
+                # Use the ObjectId's generation time
+                date_created = doc_id.generation_time.isoformat()
+
+            flattened_dict = flatten_dict(dictionary=doc)
+            concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+            # Create a FileData object for each document with source_identifiers
+            individual_file_data = FileData(
+                identifier=str(doc_id),
+                connector_type=self.connector_type,
+                source_identifiers=SourceIdentifiers(
+                    filename=str(doc_id),
+                    fullpath=str(doc_id),
+                    rel_path=str(doc_id),
+                ),
+            )
+
+            # Determine the download path
+            download_path = self.get_download_path(individual_file_data)
+            if download_path is None:
+                raise ValueError("Download path could not be determined")
+
+            download_path.parent.mkdir(parents=True, exist_ok=True)
+            download_path = download_path.with_suffix(".txt")
+
+            # Write the concatenated values to the file
+            with open(download_path, "w", encoding="utf8") as f:
+                f.write(concatenated_values)
+
+            individual_file_data.local_download_path = str(download_path)
+
+            # Update metadata
+            individual_file_data.metadata = FileDataSourceMetadata(
+                date_created=date_created,  # Include date_created here
+                date_processed=str(time()),
+                record_locator={
+                    "database": self.connection_config.database,
+                    "collection": self.connection_config.collection,
+                    "document_id": str(doc_id),
+                },
+            )
+
+            download_response = self.generate_download_response(
+                file_data=individual_file_data, download_path=download_path
+            )
+            download_responses.append(download_response)
+
+        return download_responses
+
+
 @dataclass
 class MongoDBUploadStager(UploadStager):
     upload_stager_config: MongoDBUploadStagerConfig = field(
@@ -138,3 +350,11 @@ mongodb_destination_entry = DestinationRegistryEntry(
     upload_stager=MongoDBUploadStager,
     upload_stager_config=MongoDBUploadStagerConfig,
 )
+
+mongodb_source_entry = SourceRegistryEntry(
+    connection_config=MongoDBConnectionConfig,
+    indexer_config=MongoDBIndexerConfig,
+    indexer=MongoDBIndexer,
+    downloader_config=MongoDBDownloaderConfig,
+    downloader=MongoDBDownloader,
+)
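MongoDB, previously destination-only, now also registers as a source: the indexer batches document _ids into FileData entries, and the downloader fetches each batch, flattens every document, and writes one .txt file per document. A rough usage sketch; MongoDBAccessConfig and its uri field, along with the download_dir field on the base DownloaderConfig, are assumed from how they are referenced in the code above, and all connection values are placeholders:

```python
# Sketch only: drives the new MongoDB source classes directly.
# MongoDBAccessConfig / uri are assumptions inferred from access_config.uri usage above.
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.mongodb import (
    MongoDBAccessConfig,
    MongoDBConnectionConfig,
    MongoDBDownloader,
    MongoDBDownloaderConfig,
    MongoDBIndexer,
    MongoDBIndexerConfig,
)

connection_config = MongoDBConnectionConfig(
    access_config=MongoDBAccessConfig(uri="mongodb+srv://user:pass@cluster.example.net"),  # placeholder
    database="ingest_db",
    collection="documents",
)

indexer = MongoDBIndexer(
    connection_config=connection_config,
    index_config=MongoDBIndexerConfig(batch_size=50),
)
downloader = MongoDBDownloader(
    connection_config=connection_config,
    download_config=MongoDBDownloaderConfig(download_dir=Path("/tmp/mongodb")),
)

# Each FileData from the indexer carries a batch of ids in additional_metadata["ids"];
# the downloader resolves those ids and returns one DownloadResponse per document.
indexer.precheck()  # pings the server before doing any work
for batch in indexer.run():
    responses = downloader.run(file_data=batch)
```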