unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +9 -5
- test/integration/connectors/sql/test_singlestore.py +9 -5
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +9 -5
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +69 -15
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -1,16 +1,17 @@
 from __future__ import annotations

+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional, Union
+from typing import TYPE_CHECKING, Any, Generator, Optional, Union

 from dateutil import parser
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )

+if TYPE_CHECKING:
+    from gcsfs import GCSFileSystem
+
 CONNECTOR_TYPE = "gcs"


@@ -93,6 +97,12 @@ class GcsConnectionConfig(FsspecConnectionConfig):
     access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["GCSFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+

 @dataclass
 class GcsIndexer(FsspecIndexer):
@@ -100,14 +110,6 @@ class GcsIndexer(FsspecIndexer):
     index_config: GcsIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = None
@@ -147,14 +149,6 @@ class GcsDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig)

-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-

 class GcsUploaderConfig(FsspecUploaderConfig):
     pass
@@ -166,22 +160,6 @@ class GcsUploader(FsspecUploader):
     connection_config: GcsConnectionConfig
     upload_config: GcsUploaderConfig = field(default=None)

-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-

 gcs_source_entry = SourceRegistryEntry(
     indexer=GcsIndexer,
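Note on the change above: across the fsspec connectors, dependency-gated client handling moves off the individual run/precheck/run_async overrides (which only forwarded to super() under a @requires_dependencies decorator) and onto a single get_client context manager on the connection config. A minimal, self-contained sketch of that shape follows; BaseConnectionConfig, GcsLikeConnectionConfig, and index_files are illustrative stand-ins, not the package's classes.

from contextlib import contextmanager
from typing import Generator


class BaseConnectionConfig:
    # Stand-in for FsspecConnectionConfig: the one place that builds a client.
    @contextmanager
    def get_client(self, protocol: str) -> Generator[object, None, None]:
        client = f"<{protocol} filesystem>"  # placeholder for an fsspec filesystem
        yield client


class GcsLikeConnectionConfig(BaseConnectionConfig):
    # Mirrors the shape of the new GcsConnectionConfig.get_client: the override
    # narrows the advertised client type and is where the dependency check lives.
    @contextmanager
    def get_client(self, protocol: str) -> Generator[object, None, None]:
        with super().get_client(protocol=protocol) as client:
            yield client


def index_files(config: BaseConnectionConfig, protocol: str) -> None:
    # Callers no longer carry per-method dependency decorators; they simply
    # hold the client open for the duration of the operation.
    with config.get_client(protocol=protocol) as client:
        print(f"listing files via {client}")


index_files(GcsLikeConnectionConfig(), "gs")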
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -1,15 +1,13 @@
 import contextlib
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
-    DownloadResponse,
-    FileData,
     FileDataSourceMetadata,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -29,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (

 CONNECTOR_TYPE = "s3"

+if TYPE_CHECKING:
+    from s3fs import S3FileSystem
+

 class S3IndexerConfig(FsspecIndexerConfig):
     pass
@@ -72,6 +73,12 @@ class S3ConnectionConfig(FsspecConnectionConfig):
         )
         return access_configs

+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+

 @dataclass
 class S3Indexer(FsspecIndexer):
@@ -97,7 +104,8 @@ class S3Indexer(FsspecIndexer):
         version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
-
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                metadata = client.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
@@ -114,14 +122,6 @@ class S3Indexer(FsspecIndexer):
             filesize_bytes=file_size,
         )

-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def precheck(self) -> None:
-        super().precheck()
-

 class S3DownloaderConfig(FsspecDownloaderConfig):
     pass
@@ -134,14 +134,6 @@ class S3Downloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig)

-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-

 class S3UploaderConfig(FsspecUploaderConfig):
     pass
@@ -153,22 +145,6 @@ class S3Uploader(FsspecUploader):
     connection_config: S3ConnectionConfig
     upload_config: S3UploaderConfig = field(default=None)

-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-

 s3_source_entry = SourceRegistryEntry(
     indexer=S3Indexer,
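The s3 connector gets the same treatment: the @requires_dependencies(["s3fs", "fsspec"], extras="s3") check is attached once to get_client instead of being repeated on every passthrough override. As a rough, hypothetical illustration of what such a dependency gate can look like (this is not the implementation in unstructured_ingest.utils.dep_check):

import functools
import importlib.util
from typing import Callable, Sequence


def requires_deps(deps: Sequence[str], extras: str) -> Callable:
    # Hypothetical sketch: refuse to run the wrapped callable unless the listed
    # modules are importable, and point at the pip extra that would provide them.
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            missing = [dep for dep in deps if importlib.util.find_spec(dep) is None]
            if missing:
                raise ImportError(
                    f"{func.__name__} requires {missing}; "
                    f"install with: pip install 'unstructured-ingest[{extras}]'"
                )
            return func(*args, **kwargs)

        return wrapper

    return decorator


@requires_deps(["json"], extras="example")  # "json" is stdlib, so the check passes
def connect() -> str:
    return "connected"


print(connect())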
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -1,16 +1,17 @@
 from __future__ import annotations

 import os
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import urlparse

 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )

+if TYPE_CHECKING:
+    from fsspec.implementations.sftp import SFTPFileSystem
+
 CONNECTOR_TYPE = "sftp"


@@ -67,6 +71,19 @@ class SftpConnectionConfig(FsspecConnectionConfig):
         }
         return access_config

+    @contextmanager
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
+        # The paramiko.SSHClient() client that's opened by the SFTPFileSystem
+        # never gets closed so explicitly adding that as part of this context manager
+        from fsspec import get_filesystem_class
+
+        client: SFTPFileSystem = get_filesystem_class(protocol)(
+            **self.get_access_config(),
+        )
+        yield client
+        client.client.close()
+

 @dataclass
 class SftpIndexer(FsspecIndexer):
@@ -74,13 +91,11 @@ class SftpIndexer(FsspecIndexer):
     index_config: SftpIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def __post_init__(self):
         parsed_url = urlparse(self.index_config.remote_url)
         self.connection_config.host = parsed_url.hostname or self.connection_config.host
         self.connection_config.port = parsed_url.port or self.connection_config.port

-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         for file in super().run(**kwargs):
             new_identifier = (
@@ -92,10 +107,6 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file

-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
@@ -128,20 +139,11 @@ class SftpDownloader(FsspecDownloader):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
     download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def __post_init__(self):
         parsed_url = urlparse(self.download_config.remote_url)
         self.connection_config.host = parsed_url.hostname or self.connection_config.host
         self.connection_config.port = parsed_url.port or self.connection_config.port

-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-

 class SftpUploaderConfig(FsspecUploaderConfig):
     pass
@@ -153,22 +155,6 @@ class SftpUploader(FsspecUploader):
     connection_config: SftpConnectionConfig
     upload_config: SftpUploaderConfig = field(default=None)

-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-

 sftp_source_entry = SourceRegistryEntry(
     indexer=SftpIndexer,
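The sftp get_client is the one variant that does real work: it constructs the filesystem itself and closes the underlying paramiko.SSHClient after the yield, since (per the inline comment) that client is otherwise never closed. The sketch below shows the general yield-then-cleanup shape with stand-in classes (FakeSSHClient, FakeSFTPFileSystem, and open_sftp_filesystem are invented for illustration); it wraps the cleanup in try/finally so it also runs when the body raises, which is a common hardening of the pattern rather than a statement about the connector's code.

from contextlib import contextmanager
from typing import Generator


class FakeSSHClient:
    # Placeholder for paramiko.SSHClient, used only in this sketch.
    def close(self) -> None:
        print("ssh transport closed")


class FakeSFTPFileSystem:
    # Placeholder for fsspec's SFTPFileSystem: it owns an SSH client that is
    # not closed automatically, which is what the connector works around.
    def __init__(self) -> None:
        self.client = FakeSSHClient()


@contextmanager
def open_sftp_filesystem() -> Generator[FakeSFTPFileSystem, None, None]:
    fs = FakeSFTPFileSystem()
    try:
        yield fs
    finally:
        # Explicitly close the underlying SSH client once the caller is done.
        fs.client.close()


with open_sftp_filesystem() as fs:
    print("listing remote files with", type(fs).__name__)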
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -1,11 +1,10 @@
-import sys
 from contextlib import contextmanager
-from dataclasses import dataclass
+from dataclasses import dataclass
 from datetime import datetime
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional

-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
@@ -14,9 +13,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
@@ -40,6 +42,15 @@ CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"


+class MongoDBAdditionalMetadata(BaseModel):
+    database: str
+    collection: str
+
+
+class MongoDBBatchFileData(BatchFileData):
+    additional_metadata: MongoDBAdditionalMetadata
+
+
 class MongoDBAccessConfig(AccessConfig):
     uri: Optional[str] = Field(default=None, description="URI to user when connecting")

@@ -122,7 +133,7 @@ class MongoDBIndexer(Indexer):
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to validate connection: {e}")

-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[BatchFileData, None, None]:
         """Generates FileData objects for each document in the MongoDB collection."""
         with self.connection_config.get_client() as client:
             database = client[self.index_config.database]
@@ -130,12 +141,12 @@ class MongoDBIndexer(Indexer):

             # Get list of document IDs
             ids = collection.distinct("_id")
-
+
+            ids = sorted(ids)
+            batch_size = self.index_config.batch_size

             for id_batch in batch_generator(ids, batch_size=batch_size):
                 # Make sure the hash is always a positive number to create identifier
-                batch_id = str(hash(frozenset(id_batch)) + sys.maxsize + 1)
-
                 metadata = FileDataSourceMetadata(
                     date_processed=str(time()),
                     record_locator={
@@ -144,14 +155,13 @@ class MongoDBIndexer(Indexer):
                     },
                 )

-                file_data =
-                    identifier=batch_id,
-                    doc_type="batch",
+                file_data = MongoDBBatchFileData(
                     connector_type=self.connector_type,
                     metadata=metadata,
-
-
-
+                    batch_items=[BatchItem(identifier=str(doc_id)) for doc_id in id_batch],
+                    additional_metadata=MongoDBAdditionalMetadata(
+                        collection=self.index_config.collection, database=self.index_config.database
+                    ),
                 )
                 yield file_data

@@ -162,26 +172,58 @@ class MongoDBDownloader(Downloader):
     connection_config: MongoDBConnectionConfig
     connector_type: str = CONNECTOR_TYPE

-
-
-
-    from
-    from pymongo.server_api import ServerApi
+    def generate_download_response(
+        self, doc: dict, file_data: MongoDBBatchFileData
+    ) -> DownloadResponse:
+        from bson.objectid import ObjectId

-
+        doc_id = doc["_id"]
+        doc.pop("_id", None)

-
-
-
-
-
-        )
-
-
-
-
-
-
+        # Extract date_created from the document or ObjectId
+        date_created = None
+        if "date_created" in doc:
+            # If the document has a 'date_created' field, use it
+            date_created = doc["date_created"]
+            if isinstance(date_created, datetime):
+                date_created = date_created.isoformat()
+            else:
+                # Convert to ISO format if it's a string
+                date_created = str(date_created)
+        elif isinstance(doc_id, ObjectId):
+            # Use the ObjectId's generation time
+            date_created = doc_id.generation_time.isoformat()
+
+        flattened_dict = flatten_dict(dictionary=doc)
+        concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+        # Create a FileData object for each document with source_identifiers
+        filename = f"{doc_id}.txt"
+        file_data.source_identifiers = SourceIdentifiers(
+            filename=filename,
+            fullpath=filename,
+        )
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = str(doc_id)
+
+        # Determine the download path
+        download_path = self.get_download_path(file_data=cast_file_data)
+        if download_path is None:
+            raise ValueError("Download path could not be determined")
+
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write the concatenated values to the file
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(concatenated_values)
+
+        # Update metadata
+        cast_file_data.metadata.record_locator["document_id"] = str(doc_id)
+        cast_file_data.metadata.date_created = date_created
+
+        return super().generate_download_response(
+            file_data=cast_file_data, download_path=download_path
+        )

     @SourceConnectionError.wrap
     @requires_dependencies(["bson"], extras="mongodb")
@@ -190,82 +232,34 @@ class MongoDBDownloader(Downloader):
         from bson.errors import InvalidId
         from bson.objectid import ObjectId

-
-        database = client[file_data.metadata.record_locator["database"]]
-        collection = database[file_data.metadata.record_locator["collection"]]
+        mongo_file_data = MongoDBBatchFileData.cast(file_data=file_data)

-
-
-
+        with self.connection_config.get_client() as client:
+            database = client[mongo_file_data.additional_metadata.database]
+            collection = database[mongo_file_data.additional_metadata.collection]

-
-            for doc_id in ids:
-                try:
-                    object_ids.append(ObjectId(doc_id))
-                except InvalidId as e:
-                    error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
-                    logger.error(error_message)
-                    raise ValueError(error_message) from e
+            ids = [item.identifier for item in mongo_file_data.batch_items]

-
-
-
-
-
+            object_ids = []
+            for doc_id in ids:
+                try:
+                    object_ids.append(ObjectId(doc_id))
+                except InvalidId as e:
+                    error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                    logger.error(error_message)
+                    raise ValueError(error_message) from e
+
+            try:
+                docs = list(collection.find({"_id": {"$in": object_ids}}))
+            except Exception as e:
+                logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+                raise e

         download_responses = []
         for doc in docs:
-
-
-
-            # Extract date_created from the document or ObjectId
-            date_created = None
-            if "date_created" in doc:
-                # If the document has a 'date_created' field, use it
-                date_created = doc["date_created"]
-                if isinstance(date_created, datetime):
-                    date_created = date_created.isoformat()
-                else:
-                    # Convert to ISO format if it's a string
-                    date_created = str(date_created)
-            elif isinstance(doc_id, ObjectId):
-                # Use the ObjectId's generation time
-                date_created = doc_id.generation_time.isoformat()
-
-            flattened_dict = flatten_dict(dictionary=doc)
-            concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
-
-            # Create a FileData object for each document with source_identifiers
-            individual_file_data = replace(file_data)
-            individual_file_data.identifier = str(doc_id)
-            individual_file_data.source_identifiers = SourceIdentifiers(
-                filename=str(doc_id),
-                fullpath=str(doc_id),
-                rel_path=str(doc_id),
-            )
-
-            # Determine the download path
-            download_path = self.get_download_path(individual_file_data)
-            if download_path is None:
-                raise ValueError("Download path could not be determined")
-
-            download_path.parent.mkdir(parents=True, exist_ok=True)
-            download_path = download_path.with_suffix(".txt")
-
-            # Write the concatenated values to the file
-            with open(download_path, "w", encoding="utf8") as f:
-                f.write(concatenated_values)
-
-            individual_file_data.local_download_path = str(download_path)
-
-            # Update metadata
-            individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
-            individual_file_data.metadata.date_created = date_created
-
-            download_response = self.generate_download_response(
-                file_data=individual_file_data, download_path=download_path
+            download_responses.append(
+                self.generate_download_response(doc=doc, file_data=mongo_file_data)
             )
-            download_responses.append(download_response)

         return download_responses

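Summary of the MongoDB rework above: the indexer no longer hashes a frozenset of ids into a synthetic batch identifier; it emits a typed MongoDBBatchFileData whose batch_items hold the document ids and whose additional_metadata names the database and collection, and the downloader casts the incoming file data back to that model instead of parsing the record_locator. A small sketch of that data shape using stand-in pydantic models (BatchItemSketch and MongoBatchSketch are illustrative names, not the library's classes):

from pydantic import BaseModel


class BatchItemSketch(BaseModel):
    # Stand-in for unstructured_ingest's BatchItem: one record in a batch.
    identifier: str


class MongoBatchSketch(BaseModel):
    # Stand-in for MongoDBBatchFileData: the batch plus where it came from.
    database: str
    collection: str
    batch_items: list[BatchItemSketch]


# Indexer side: emit one typed batch per group of document ids.
batch = MongoBatchSketch(
    database="ingest",
    collection="docs",
    batch_items=[BatchItemSketch(identifier=str(i)) for i in range(3)],
)

# Downloader side: no record_locator string parsing needed; the database,
# collection, and document ids are regular typed fields on the model.
ids = [item.identifier for item in batch.batch_items]
print(batch.database, batch.collection, ids)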
unstructured_ingest/v2/processes/connectors/neo4j.py

@@ -10,7 +10,6 @@ from enum import Enum
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional

-import networkx as nx
 from pydantic import BaseModel, ConfigDict, Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connector_registry import (

 if TYPE_CHECKING:
     from neo4j import AsyncDriver, Auth
+    from networkx import Graph, MultiDiGraph

 CONNECTOR_TYPE = "neo4j"

@@ -109,7 +109,9 @@ class Neo4jUploadStager(UploadStager):

         return output_filepath

-    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) ->
+    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
+        import networkx as nx
+
         graph = nx.MultiDiGraph()
         graph.add_node(document_node)

@@ -180,7 +182,7 @@ class _GraphData(BaseModel):
     edges: list[_Edge]

     @classmethod
-    def from_nx(cls, nx_graph:
+    def from_nx(cls, nx_graph: "MultiDiGraph") -> _GraphData:
         nodes = list(nx_graph.nodes())
         edges = [
             _Edge(
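The neo4j change is import hygiene: networkx is no longer imported at module load time; its types are referenced under TYPE_CHECKING as string annotations, and the real import is deferred into the method that builds the graph. A minimal sketch of the same pattern, assuming networkx is installed when the function actually runs (build_graph is an illustrative helper, not part of the connector):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers; no runtime dependency on networkx here.
    from networkx import MultiDiGraph


def build_graph(node_ids: list[str]) -> "MultiDiGraph":
    # Deferred import: the optional dependency is only required when this
    # code path actually runs, mirroring _create_lexical_graph above.
    import networkx as nx

    graph = nx.MultiDiGraph()
    graph.add_nodes_from(node_ids)
    return graph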
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -202,7 +202,7 @@ class OnedriveDownloader(Downloader):
         if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
             raise ValueError(
                 f"file data doesn't have enough information to get "
-                f"file content: {file_data.
+                f"file content: {file_data.model_dump()}"
             )

         server_relative_path = file_data.source_identifiers.fullpath
unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -13,6 +12,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -99,12 +99,12 @@ class PostgresDownloader(SQLDownloader):
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["psycopg2"], extras="postgres")
-    def query_db(self, file_data:
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         from psycopg2 import sql

-        table_name = file_data.additional_metadata
-        id_column = file_data.additional_metadata
-        ids = tuple(file_data.
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = tuple([item.identifier for item in file_data.batch_items])

         with self.connection_config.get_cursor() as cursor:
             fields = (