unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_postgres.py +3 -3
- test/integration/connectors/sql/test_singlestore.py +3 -3
- test/integration/connectors/sql/test_sqlite.py +3 -3
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -1,11 +1,10 @@
-import sys
 from contextlib import contextmanager
-from dataclasses import dataclass
+from dataclasses import dataclass
 from datetime import datetime
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional

-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
@@ -14,9 +13,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
@@ -40,6 +42,15 @@ CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"


+class MongoDBAdditionalMetadata(BaseModel):
+    database: str
+    collection: str
+
+
+class MongoDBBatchFileData(BatchFileData):
+    additional_metadata: MongoDBAdditionalMetadata
+
+
 class MongoDBAccessConfig(AccessConfig):
     uri: Optional[str] = Field(default=None, description="URI to user when connecting")

@@ -122,7 +133,7 @@ class MongoDBIndexer(Indexer):
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to validate connection: {e}")

-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[BatchFileData, None, None]:
         """Generates FileData objects for each document in the MongoDB collection."""
         with self.connection_config.get_client() as client:
             database = client[self.index_config.database]
@@ -130,12 +141,12 @@ class MongoDBIndexer(Indexer):

             # Get list of document IDs
             ids = collection.distinct("_id")
-
+
+            ids = sorted(ids)
+            batch_size = self.index_config.batch_size

             for id_batch in batch_generator(ids, batch_size=batch_size):
                 # Make sure the hash is always a positive number to create identifier
-                batch_id = str(hash(frozenset(id_batch)) + sys.maxsize + 1)
-
                 metadata = FileDataSourceMetadata(
                     date_processed=str(time()),
                     record_locator={
@@ -144,14 +155,13 @@ class MongoDBIndexer(Indexer):
                     },
                 )

-                file_data =
-                    identifier=batch_id,
-                    doc_type="batch",
+                file_data = MongoDBBatchFileData(
                     connector_type=self.connector_type,
                     metadata=metadata,
-
-
-
+                    batch_items=[BatchItem(identifier=str(doc_id)) for doc_id in id_batch],
+                    additional_metadata=MongoDBAdditionalMetadata(
+                        collection=self.index_config.collection, database=self.index_config.database
+                    ),
                 )
                 yield file_data

@@ -162,26 +172,59 @@ class MongoDBDownloader(Downloader):
     connection_config: MongoDBConnectionConfig
     connector_type: str = CONNECTOR_TYPE

-
-
-
-        from
-        from pymongo.server_api import ServerApi
+    def generate_download_response(
+        self, doc: dict, file_data: MongoDBBatchFileData
+    ) -> DownloadResponse:
+        from bson.objectid import ObjectId

-
+        doc_id = doc["_id"]
+        doc.pop("_id", None)

-
-
-
-
-
-        )
-
-
-
-
-
-
+        # Extract date_created from the document or ObjectId
+        date_created = None
+        if "date_created" in doc:
+            # If the document has a 'date_created' field, use it
+            date_created = doc["date_created"]
+            if isinstance(date_created, datetime):
+                date_created = date_created.isoformat()
+            else:
+                # Convert to ISO format if it's a string
+                date_created = str(date_created)
+        elif isinstance(doc_id, ObjectId):
+            # Use the ObjectId's generation time
+            date_created = doc_id.generation_time.isoformat()
+
+        flattened_dict = flatten_dict(dictionary=doc)
+        concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+        # Create a FileData object for each document with source_identifiers
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = str(doc_id)
+        filename = f"{doc_id}.txt"
+        cast_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename,
+            fullpath=filename,
+            rel_path=filename,
+        )
+
+        # Determine the download path
+        download_path = self.get_download_path(file_data=cast_file_data)
+        if download_path is None:
+            raise ValueError("Download path could not be determined")
+
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write the concatenated values to the file
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(concatenated_values)
+
+        # Update metadata
+        cast_file_data.metadata.record_locator["document_id"] = str(doc_id)
+        cast_file_data.metadata.date_created = date_created
+
+        return super().generate_download_response(
+            file_data=cast_file_data, download_path=download_path
+        )

     @SourceConnectionError.wrap
     @requires_dependencies(["bson"], extras="mongodb")
@@ -190,82 +233,34 @@ class MongoDBDownloader(Downloader):
         from bson.errors import InvalidId
         from bson.objectid import ObjectId

-
-        database = client[file_data.metadata.record_locator["database"]]
-        collection = database[file_data.metadata.record_locator["collection"]]
+        mongo_file_data = MongoDBBatchFileData.cast(file_data=file_data)

-
-
-
+        with self.connection_config.get_client() as client:
+            database = client[mongo_file_data.additional_metadata.database]
+            collection = database[mongo_file_data.additional_metadata.collection]

-
-        for doc_id in ids:
-            try:
-                object_ids.append(ObjectId(doc_id))
-            except InvalidId as e:
-                error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
-                logger.error(error_message)
-                raise ValueError(error_message) from e
+            ids = [item.identifier for item in mongo_file_data.batch_items]

-
-
-
-
-
+            object_ids = []
+            for doc_id in ids:
+                try:
+                    object_ids.append(ObjectId(doc_id))
+                except InvalidId as e:
+                    error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                    logger.error(error_message)
+                    raise ValueError(error_message) from e
+
+            try:
+                docs = list(collection.find({"_id": {"$in": object_ids}}))
+            except Exception as e:
+                logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+                raise e

         download_responses = []
         for doc in docs:
-
-
-
-            # Extract date_created from the document or ObjectId
-            date_created = None
-            if "date_created" in doc:
-                # If the document has a 'date_created' field, use it
-                date_created = doc["date_created"]
-                if isinstance(date_created, datetime):
-                    date_created = date_created.isoformat()
-                else:
-                    # Convert to ISO format if it's a string
-                    date_created = str(date_created)
-            elif isinstance(doc_id, ObjectId):
-                # Use the ObjectId's generation time
-                date_created = doc_id.generation_time.isoformat()
-
-            flattened_dict = flatten_dict(dictionary=doc)
-            concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
-
-            # Create a FileData object for each document with source_identifiers
-            individual_file_data = replace(file_data)
-            individual_file_data.identifier = str(doc_id)
-            individual_file_data.source_identifiers = SourceIdentifiers(
-                filename=str(doc_id),
-                fullpath=str(doc_id),
-                rel_path=str(doc_id),
-            )
-
-            # Determine the download path
-            download_path = self.get_download_path(individual_file_data)
-            if download_path is None:
-                raise ValueError("Download path could not be determined")
-
-            download_path.parent.mkdir(parents=True, exist_ok=True)
-            download_path = download_path.with_suffix(".txt")
-
-            # Write the concatenated values to the file
-            with open(download_path, "w", encoding="utf8") as f:
-                f.write(concatenated_values)
-
-            individual_file_data.local_download_path = str(download_path)
-
-            # Update metadata
-            individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
-            individual_file_data.metadata.date_created = date_created
-
-            download_response = self.generate_download_response(
-                file_data=individual_file_data, download_path=download_path
+            download_responses.append(
+                self.generate_download_response(doc=doc, file_data=mongo_file_data)
             )
-            download_responses.append(download_response)

         return download_responses

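The mongodb.py hunks above replace the old single FileData "batch" (a hashed identifier, doc_type="batch", and an ad-hoc ids list) with typed batch models. A minimal sketch of that shape, assuming pydantic-style models along the lines of the BatchItem / BatchFileData interfaces imported from unstructured_ingest.v2.interfaces; the stand-in classes below are illustrative, not the library's actual definitions:

from pydantic import BaseModel


class BatchItem(BaseModel):  # stand-in for unstructured_ingest's BatchItem
    identifier: str


class MongoDBAdditionalMetadata(BaseModel):
    database: str
    collection: str


class MongoDBBatchFileData(BaseModel):  # stand-in; the real class extends BatchFileData
    connector_type: str
    batch_items: list[BatchItem]
    additional_metadata: MongoDBAdditionalMetadata


# One payload now describes a whole batch of MongoDB _ids; the downloader later
# fans it out into one download response per document.
batch = MongoDBBatchFileData(
    connector_type="mongodb",
    batch_items=[BatchItem(identifier=doc_id) for doc_id in ("a1", "a2", "a3")],
    additional_metadata=MongoDBAdditionalMetadata(database="ingest", collection="docs"),
)
print([item.identifier for item in batch.batch_items])
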
unstructured_ingest/v2/processes/connectors/neo4j.py

@@ -10,7 +10,6 @@ from enum import Enum
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional

-import networkx as nx
 from pydantic import BaseModel, ConfigDict, Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connector_registry import (

 if TYPE_CHECKING:
     from neo4j import AsyncDriver, Auth
+    from networkx import Graph, MultiDiGraph

 CONNECTOR_TYPE = "neo4j"

@@ -109,7 +109,9 @@ class Neo4jUploadStager(UploadStager):

         return output_filepath

-    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) ->
+    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
+        import networkx as nx
+
         graph = nx.MultiDiGraph()
         graph.add_node(document_node)

@@ -180,7 +182,7 @@ class _GraphData(BaseModel):
     edges: list[_Edge]

     @classmethod
-    def from_nx(cls, nx_graph:
+    def from_nx(cls, nx_graph: "MultiDiGraph") -> _GraphData:
         nodes = list(nx_graph.nodes())
         edges = [
             _Edge(
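The neo4j.py hunks drop the module-level import networkx as nx in favour of a TYPE_CHECKING import (for the "Graph" / "MultiDiGraph" annotations) plus a local import inside the method that actually builds the graph, so networkx is only needed at call time. A generic sketch of that deferred-import pattern; build_graph and its body are illustrative, not the connector's code:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; no runtime dependency on networkx at import time.
    from networkx import MultiDiGraph


def build_graph(edges: list[tuple[str, str]]) -> "MultiDiGraph":
    # Deferred import: networkx has to be installed only if this function is called.
    import networkx as nx

    graph = nx.MultiDiGraph()
    graph.add_edges_from(edges)
    return graph
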
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -202,7 +202,7 @@ class OnedriveDownloader(Downloader):
         if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
             raise ValueError(
                 f"file data doesn't have enough information to get "
-                f"file content: {file_data.
+                f"file content: {file_data.model_dump()}"
             )

         server_relative_path = file_data.source_identifiers.fullpath
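The onedrive.py change reworks the error message to serialize the whole FileData object with pydantic v2's model_dump(). A tiny illustration of that call, with a made-up model standing in for FileData:

from typing import Optional

from pydantic import BaseModel


class ExampleFileData(BaseModel):  # hypothetical stand-in for FileData
    identifier: str
    fullpath: Optional[str] = None


fd = ExampleFileData(identifier="abc123")
if not fd.fullpath:
    # model_dump() turns the model into a plain dict, so the error shows every field
    print(f"file data doesn't have enough information to get file content: {fd.model_dump()}")
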
unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -13,6 +12,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -99,12 +99,12 @@ class PostgresDownloader(SQLDownloader):
     connector_type: str = CONNECTOR_TYPE

     @requires_dependencies(["psycopg2"], extras="postgres")
-    def query_db(self, file_data:
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         from psycopg2 import sql

-        table_name = file_data.additional_metadata
-        id_column = file_data.additional_metadata
-        ids = tuple(file_data.
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = tuple([item.identifier for item in file_data.batch_items])

         with self.connection_config.get_cursor() as cursor:
             fields = (
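In postgres.py, and in the matching singlestore.py, snowflake.py, and sqlite.py hunks further down, query_db now pulls table_name and id_column from the typed additional_metadata and expands the batch item identifiers into a parameterized IN (...) query. A generic DB-API sketch of that expansion using the stdlib sqlite3 driver; the table and column names are invented, and the real connectors build the statement per backend (Postgres going through psycopg2):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE elements (id TEXT, payload TEXT)")
conn.executemany("INSERT INTO elements VALUES (?, ?)", [("a", "1"), ("b", "2"), ("c", "3")])

ids = ["a", "c"]             # item.identifier for each BatchItem in the file data
values_delimiter = "?"       # sqlite placeholder; "%s" for postgres/singlestore
placeholders = ",".join([values_delimiter] * len(ids))
query = f"SELECT id, payload FROM elements WHERE id IN ({placeholders})"
rows = conn.execute(query, ids).fetchall()
print(rows)  # [('a', '1'), ('c', '3')]
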
unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field, Secret

-from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -14,6 +13,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -93,10 +93,10 @@ class SingleStoreDownloader(SQLDownloader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "%s"

-    def query_db(self, file_data:
-        table_name = file_data.additional_metadata
-        id_column = file_data.additional_metadata
-        ids = tuple(file_data.
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = tuple([item.identifier for item in file_data.batch_items])
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -9,7 +9,6 @@ from pydantic import Field, Secret

 from unstructured_ingest.utils.data_prep import split_dataframe
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -118,10 +118,10 @@ class SnowflakeDownloader(SQLDownloader):

     # The actual snowflake module package name is: snowflake-connector-python
     @requires_dependencies(["snowflake"], extras="snowflake")
-    def query_db(self, file_data:
-        table_name = file_data.additional_metadata
-        id_column = file_data.additional_metadata
-        ids = file_data.
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = [item.identifier for item in file_data.batch_items]

         with self.connection_config.get_cursor() as cursor:
             query = "SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})".format(
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -1,9 +1,8 @@
 import hashlib
 import json
-import sys
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
 from time import time
@@ -12,13 +11,15 @@ from typing import Any, Generator, Union
 import numpy as np
 import pandas as pd
 from dateutil import parser
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.utils.data_prep import get_data_df, split_dataframe
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -81,6 +82,15 @@ _COLUMNS = (
 _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")


+class SqlAdditionalMetadata(BaseModel):
+    table_name: str
+    id_column: str
+
+
+class SqlBatchFileData(BatchFileData):
+    additional_metadata: SqlAdditionalMetadata
+
+
 def parse_date_string(date_value: Union[str, int]) -> date:
     try:
         timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
@@ -124,7 +134,7 @@ class SQLIndexer(Indexer, ABC):
                 f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
             )
             results = cursor.fetchall()
-            ids = [result[0] for result in results]
+            ids = sorted([result[0] for result in results])
             return ids

     def precheck(self) -> None:
@@ -135,7 +145,7 @@ class SQLIndexer(Indexer, ABC):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")

-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[SqlBatchFileData, None, None]:
         ids = self._get_doc_ids()
         id_batches: list[frozenset[str]] = [
             frozenset(
@@ -151,19 +161,15 @@ class SQLIndexer(Indexer, ABC):
         ]
         for batch in id_batches:
             # Make sure the hash is always a positive number to create identified
-
-            yield FileData(
-                identifier=identified,
+            yield SqlBatchFileData(
                 connector_type=self.connector_type,
                 metadata=FileDataSourceMetadata(
                     date_processed=str(time()),
                 ),
-
-
-
-
-                    "id_column": self.index_config.id_column,
-                },
+                additional_metadata=SqlAdditionalMetadata(
+                    table_name=self.index_config.table_name, id_column=self.index_config.id_column
+                ),
+                batch_items=[BatchItem(identifier=str(b)) for b in batch],
             )


@@ -176,7 +182,7 @@ class SQLDownloader(Downloader, ABC):
     download_config: SQLDownloaderConfig

     @abstractmethod
-    def query_db(self, file_data:
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         pass

     def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
@@ -185,7 +191,7 @@ class SQLDownloader(Downloader, ABC):
         dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
         return dfs

-    def get_data(self, file_data:
+    def get_data(self, file_data: SqlBatchFileData) -> list[pd.DataFrame]:
         rows, columns = self.query_db(file_data=file_data)
         return self.sql_to_df(rows=rows, columns=columns)

@@ -199,10 +205,10 @@ class SQLDownloader(Downloader, ABC):
         return f

     def generate_download_response(
-        self, result: pd.DataFrame, file_data:
+        self, result: pd.DataFrame, file_data: SqlBatchFileData
     ) -> DownloadResponse:
-        id_column = file_data.additional_metadata
-        table_name = file_data.additional_metadata
+        id_column = file_data.additional_metadata.id_column
+        table_name = file_data.additional_metadata.table_name
         record_id = result.iloc[0][id_column]
         filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
         filename = f"{filename_id}.csv"
@@ -212,20 +218,19 @@ class SQLDownloader(Downloader, ABC):
         )
         download_path.parent.mkdir(parents=True, exist_ok=True)
         result.to_csv(download_path, index=False)
-
-
-        copied_file_data.doc_type = "file"
-        copied_file_data.additional_metadata.pop("ids", None)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
         return super().generate_download_response(
-            file_data=
+            file_data=cast_file_data, download_path=download_path
        )

     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
-
+        sql_filedata = SqlBatchFileData.cast(file_data=file_data)
+        data_dfs = self.get_data(file_data=sql_filedata)
         download_responses = []
         for df in data_dfs:
             download_responses.append(
-                self.generate_download_response(result=df, file_data=
+                self.generate_download_response(result=df, file_data=sql_filedata)
             )
         return download_responses

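The sql.py indexer now sorts the ids returned by _get_doc_ids before slicing them into batches, so batch membership is deterministic across runs, and each batch is emitted as a SqlBatchFileData whose batch_items carry the individual row identifiers instead of a hashed synthetic identifier. A rough sketch of that batching step; the helper below is illustrative, while the connector itself slices the sorted id list into frozensets sized by batch_size:

from typing import Generator, Iterable


def batch_ids(ids: Iterable[str], batch_size: int) -> Generator[list[str], None, None]:
    # Illustrative batching helper: sort for determinism, then emit fixed-size chunks.
    ordered = sorted(ids)
    for start in range(0, len(ordered), batch_size):
        yield ordered[start : start + batch_size]


row_ids = ["7", "3", "9", "1", "5"]
for batch in batch_ids(row_ids, batch_size=2):
    # Each chunk would become one SqlBatchFileData with one BatchItem per id.
    print(batch)
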
unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Generator

 from pydantic import Field, Secret, model_validator

-from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -15,6 +14,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -97,10 +97,10 @@ class SQLiteDownloader(SQLDownloader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"

-    def query_db(self, file_data:
-        table_name = file_data.additional_metadata
-        id_column = file_data.additional_metadata
-        ids = file_data.
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = [item.identifier for item in file_data.batch_items]
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"