unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)
  1. test/integration/connectors/sql/test_postgres.py +3 -3
  2. test/integration/connectors/sql/test_singlestore.py +3 -3
  3. test/integration/connectors/sql/test_sqlite.py +3 -3
  4. test/integration/connectors/test_astradb.py +40 -0
  5. test/integration/connectors/test_kafka.py +2 -2
  6. test/integration/connectors/test_mongodb.py +4 -1
  7. test/integration/connectors/utils/validation/source.py +31 -11
  8. unstructured_ingest/__version__.py +1 -1
  9. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  10. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  12. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  13. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  15. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  16. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  17. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  18. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  19. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  21. unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
  22. unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
  23. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  24. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  25. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  26. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  27. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  28. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  29. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  30. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  31. unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
  32. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  33. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  34. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  36. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  37. unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
  38. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  39. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
  40. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
  41. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -1,11 +1,10 @@
-import sys
 from contextlib import contextmanager
-from dataclasses import dataclass, replace
+from dataclasses import dataclass
 from datetime import datetime
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
@@ -14,9 +13,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
@@ -40,6 +42,15 @@ CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"
 
 
+class MongoDBAdditionalMetadata(BaseModel):
+    database: str
+    collection: str
+
+
+class MongoDBBatchFileData(BatchFileData):
+    additional_metadata: MongoDBAdditionalMetadata
+
+
 class MongoDBAccessConfig(AccessConfig):
     uri: Optional[str] = Field(default=None, description="URI to user when connecting")
 
@@ -122,7 +133,7 @@ class MongoDBIndexer(Indexer):
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to validate connection: {e}")
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[BatchFileData, None, None]:
         """Generates FileData objects for each document in the MongoDB collection."""
         with self.connection_config.get_client() as client:
             database = client[self.index_config.database]
@@ -130,12 +141,12 @@ class MongoDBIndexer(Indexer):
 
             # Get list of document IDs
             ids = collection.distinct("_id")
-            batch_size = self.index_config.batch_size if self.index_config else 100
+
+            ids = sorted(ids)
+            batch_size = self.index_config.batch_size
 
             for id_batch in batch_generator(ids, batch_size=batch_size):
                 # Make sure the hash is always a positive number to create identifier
-                batch_id = str(hash(frozenset(id_batch)) + sys.maxsize + 1)
-
                 metadata = FileDataSourceMetadata(
                     date_processed=str(time()),
                     record_locator={
@@ -144,14 +155,13 @@
                     },
                 )
 
-                file_data = FileData(
-                    identifier=batch_id,
-                    doc_type="batch",
+                file_data = MongoDBBatchFileData(
                     connector_type=self.connector_type,
                     metadata=metadata,
-                    additional_metadata={
-                        "ids": [str(doc_id) for doc_id in id_batch],
-                    },
+                    batch_items=[BatchItem(identifier=str(doc_id)) for doc_id in id_batch],
+                    additional_metadata=MongoDBAdditionalMetadata(
+                        collection=self.index_config.collection, database=self.index_config.database
+                    ),
                 )
                 yield file_data
 
@@ -162,26 +172,59 @@ class MongoDBDownloader(Downloader):
     connection_config: MongoDBConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["pymongo"], extras="mongodb")
-    def create_client(self) -> "MongoClient":
-        from pymongo import MongoClient
-        from pymongo.driver_info import DriverInfo
-        from pymongo.server_api import ServerApi
+    def generate_download_response(
+        self, doc: dict, file_data: MongoDBBatchFileData
+    ) -> DownloadResponse:
+        from bson.objectid import ObjectId
 
-        access_config = self.connection_config.access_config.get_secret_value()
+        doc_id = doc["_id"]
+        doc.pop("_id", None)
 
-        if access_config.uri:
-            return MongoClient(
-                access_config.uri,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-                driver=DriverInfo(name="unstructured", version=unstructured_version),
-            )
-        else:
-            return MongoClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                server_api=ServerApi(version=SERVER_API_VERSION),
-            )
+        # Extract date_created from the document or ObjectId
+        date_created = None
+        if "date_created" in doc:
+            # If the document has a 'date_created' field, use it
+            date_created = doc["date_created"]
+            if isinstance(date_created, datetime):
+                date_created = date_created.isoformat()
+            else:
+                # Convert to ISO format if it's a string
+                date_created = str(date_created)
+        elif isinstance(doc_id, ObjectId):
+            # Use the ObjectId's generation time
+            date_created = doc_id.generation_time.isoformat()
+
+        flattened_dict = flatten_dict(dictionary=doc)
+        concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
+
+        # Create a FileData object for each document with source_identifiers
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = str(doc_id)
+        filename = f"{doc_id}.txt"
+        cast_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename,
+            fullpath=filename,
+            rel_path=filename,
+        )
+
+        # Determine the download path
+        download_path = self.get_download_path(file_data=cast_file_data)
+        if download_path is None:
+            raise ValueError("Download path could not be determined")
+
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write the concatenated values to the file
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(concatenated_values)
+
+        # Update metadata
+        cast_file_data.metadata.record_locator["document_id"] = str(doc_id)
+        cast_file_data.metadata.date_created = date_created
+
+        return super().generate_download_response(
+            file_data=cast_file_data, download_path=download_path
+        )
 
     @SourceConnectionError.wrap
     @requires_dependencies(["bson"], extras="mongodb")
@@ -190,82 +233,34 @@ class MongoDBDownloader(Downloader):
         from bson.errors import InvalidId
         from bson.objectid import ObjectId
 
-        client = self.create_client()
-        database = client[file_data.metadata.record_locator["database"]]
-        collection = database[file_data.metadata.record_locator["collection"]]
+        mongo_file_data = MongoDBBatchFileData.cast(file_data=file_data)
 
-        ids = file_data.additional_metadata.get("ids", [])
-        if not ids:
-            raise ValueError("No document IDs provided in additional_metadata")
+        with self.connection_config.get_client() as client:
+            database = client[mongo_file_data.additional_metadata.database]
+            collection = database[mongo_file_data.additional_metadata.collection]
 
-        object_ids = []
-        for doc_id in ids:
-            try:
-                object_ids.append(ObjectId(doc_id))
-            except InvalidId as e:
-                error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
-                logger.error(error_message)
-                raise ValueError(error_message) from e
+            ids = [item.identifier for item in mongo_file_data.batch_items]
 
-        try:
-            docs = list(collection.find({"_id": {"$in": object_ids}}))
-        except Exception as e:
-            logger.error(f"Failed to fetch documents: {e}", exc_info=True)
-            raise e
+            object_ids = []
+            for doc_id in ids:
+                try:
+                    object_ids.append(ObjectId(doc_id))
+                except InvalidId as e:
+                    error_message = f"Invalid ObjectId for doc_id '{doc_id}': {str(e)}"
+                    logger.error(error_message)
+                    raise ValueError(error_message) from e
+
+            try:
+                docs = list(collection.find({"_id": {"$in": object_ids}}))
+            except Exception as e:
+                logger.error(f"Failed to fetch documents: {e}", exc_info=True)
+                raise e
 
         download_responses = []
         for doc in docs:
-            doc_id = doc["_id"]
-            doc.pop("_id", None)
-
-            # Extract date_created from the document or ObjectId
-            date_created = None
-            if "date_created" in doc:
-                # If the document has a 'date_created' field, use it
-                date_created = doc["date_created"]
-                if isinstance(date_created, datetime):
-                    date_created = date_created.isoformat()
-                else:
-                    # Convert to ISO format if it's a string
-                    date_created = str(date_created)
-            elif isinstance(doc_id, ObjectId):
-                # Use the ObjectId's generation time
-                date_created = doc_id.generation_time.isoformat()
-
-            flattened_dict = flatten_dict(dictionary=doc)
-            concatenated_values = "\n".join(str(value) for value in flattened_dict.values())
-
-            # Create a FileData object for each document with source_identifiers
-            individual_file_data = replace(file_data)
-            individual_file_data.identifier = str(doc_id)
-            individual_file_data.source_identifiers = SourceIdentifiers(
-                filename=str(doc_id),
-                fullpath=str(doc_id),
-                rel_path=str(doc_id),
-            )
-
-            # Determine the download path
-            download_path = self.get_download_path(individual_file_data)
-            if download_path is None:
-                raise ValueError("Download path could not be determined")
-
-            download_path.parent.mkdir(parents=True, exist_ok=True)
-            download_path = download_path.with_suffix(".txt")
-
-            # Write the concatenated values to the file
-            with open(download_path, "w", encoding="utf8") as f:
-                f.write(concatenated_values)
-
-            individual_file_data.local_download_path = str(download_path)
-
-            # Update metadata
-            individual_file_data.metadata.record_locator["document_id"] = str(doc_id)
-            individual_file_data.metadata.date_created = date_created
-
-            download_response = self.generate_download_response(
-                file_data=individual_file_data, download_path=download_path
+            download_responses.append(
+                self.generate_download_response(doc=doc, file_data=mongo_file_data)
             )
-            download_responses.append(download_response)
 
         return download_responses
 
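Reading the MongoDB hunks together: the indexer now emits a typed batch record instead of a generic FileData carrying an "ids" dict. A minimal sketch of how such a batch appears to be constructed, using only the classes and fields shown in the diff above; the database, collection, and ObjectId values below are placeholders, not taken from the package:

    from unstructured_ingest.v2.interfaces import BatchItem, FileDataSourceMetadata
    from unstructured_ingest.v2.processes.connectors.mongodb import (
        MongoDBAdditionalMetadata,
        MongoDBBatchFileData,
    )

    # Sketch only: illustrative values, mirroring what MongoDBIndexer.run() yields.
    batch = MongoDBBatchFileData(
        connector_type="mongodb",
        metadata=FileDataSourceMetadata(date_processed="1700000000.0"),
        additional_metadata=MongoDBAdditionalMetadata(database="ingest_db", collection="docs"),
        batch_items=[BatchItem(identifier="6553f5a1c1a2b3c4d5e6f789")],
    )

    # The downloader recovers the typed view from a generic FileData via
    # MongoDBBatchFileData.cast(file_data=...), as shown in the run() hunk above.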
unstructured_ingest/v2/processes/connectors/neo4j.py

@@ -10,7 +10,6 @@ from enum import Enum
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
 
-import networkx as nx
 from pydantic import BaseModel, ConfigDict, Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
@@ -33,6 +32,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 
 if TYPE_CHECKING:
     from neo4j import AsyncDriver, Auth
+    from networkx import Graph, MultiDiGraph
 
 CONNECTOR_TYPE = "neo4j"
 
@@ -109,7 +109,9 @@ class Neo4jUploadStager(UploadStager):
 
         return output_filepath
 
-    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> nx.Graph:
+    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
+        import networkx as nx
+
         graph = nx.MultiDiGraph()
         graph.add_node(document_node)
 
@@ -180,7 +182,7 @@ class _GraphData(BaseModel):
     edges: list[_Edge]
 
     @classmethod
-    def from_nx(cls, nx_graph: nx.MultiDiGraph) -> _GraphData:
+    def from_nx(cls, nx_graph: "MultiDiGraph") -> _GraphData:
        nodes = list(nx_graph.nodes())
        edges = [
            _Edge(
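The neo4j change is a dependency-deferral pattern rather than a behavioral one: networkx is no longer imported at module load, only inside the method that needs it, while annotations use TYPE_CHECKING string references. A small self-contained sketch of that pattern; the function and names below are illustrative, not part of the connector:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by static type checkers; no runtime import cost.
        from networkx import MultiDiGraph


    def build_graph(edges: list[tuple[str, str]]) -> "MultiDiGraph":
        # Deferred import: importing this module never requires networkx,
        # only calling this function does.
        import networkx as nx

        graph = nx.MultiDiGraph()
        graph.add_edges_from(edges)
        return graph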
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -202,7 +202,7 @@ class OnedriveDownloader(Downloader):
         if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
             raise ValueError(
                 f"file data doesn't have enough information to get "
-                f"file content: {file_data.to_dict()}"
+                f"file content: {file_data.model_dump()}"
             )
 
         server_relative_path = file_data.source_identifiers.fullpath
unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Generator, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -13,6 +12,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -99,12 +99,12 @@ class PostgresDownloader(SQLDownloader):
     connector_type: str = CONNECTOR_TYPE
 
     @requires_dependencies(["psycopg2"], extras="postgres")
-    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         from psycopg2 import sql
 
-        table_name = file_data.additional_metadata["table_name"]
-        id_column = file_data.additional_metadata["id_column"]
-        ids = tuple(file_data.additional_metadata["ids"])
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = tuple([item.identifier for item in file_data.batch_items])
 
         with self.connection_config.get_cursor() as cursor:
             fields = (
unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -14,6 +13,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -93,10 +93,10 @@ class SingleStoreDownloader(SQLDownloader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "%s"
 
-    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
-        table_name = file_data.additional_metadata["table_name"]
-        id_column = file_data.additional_metadata["id_column"]
-        ids = tuple(file_data.additional_metadata["ids"])
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = tuple([item.identifier for item in file_data.batch_items])
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -9,7 +9,6 @@ from pydantic import Field, Secret
 
 from unstructured_ingest.utils.data_prep import split_dataframe
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -118,10 +118,10 @@ class SnowflakeDownloader(SQLDownloader):
 
     # The actual snowflake module package name is: snowflake-connector-python
     @requires_dependencies(["snowflake"], extras="snowflake")
-    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
-        table_name = file_data.additional_metadata["table_name"]
-        id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = [item.identifier for item in file_data.batch_items]
 
         with self.connection_config.get_cursor() as cursor:
             query = "SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})".format(
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -1,9 +1,8 @@
 import hashlib
 import json
-import sys
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from dataclasses import dataclass, field, replace
+from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
 from time import time
@@ -12,13 +11,15 @@ from typing import Any, Generator, Union
 import numpy as np
 import pandas as pd
 from dateutil import parser
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
 from unstructured_ingest.utils.data_prep import get_data_df, split_dataframe
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -81,6 +82,15 @@ _COLUMNS = (
 _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
 
 
+class SqlAdditionalMetadata(BaseModel):
+    table_name: str
+    id_column: str
+
+
+class SqlBatchFileData(BatchFileData):
+    additional_metadata: SqlAdditionalMetadata
+
+
 def parse_date_string(date_value: Union[str, int]) -> date:
     try:
         timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
@@ -124,7 +134,7 @@ class SQLIndexer(Indexer, ABC):
                 f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
             )
             results = cursor.fetchall()
-            ids = [result[0] for result in results]
+            ids = sorted([result[0] for result in results])
         return ids
 
     def precheck(self) -> None:
@@ -135,7 +145,7 @@ class SQLIndexer(Indexer, ABC):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[SqlBatchFileData, None, None]:
         ids = self._get_doc_ids()
         id_batches: list[frozenset[str]] = [
             frozenset(
@@ -151,19 +161,15 @@ class SQLIndexer(Indexer, ABC):
         ]
         for batch in id_batches:
             # Make sure the hash is always a positive number to create identified
-            identified = str(hash(batch) + sys.maxsize + 1)
-            yield FileData(
-                identifier=identified,
+            yield SqlBatchFileData(
                 connector_type=self.connector_type,
                 metadata=FileDataSourceMetadata(
                     date_processed=str(time()),
                 ),
-                doc_type="batch",
-                additional_metadata={
-                    "ids": list(batch),
-                    "table_name": self.index_config.table_name,
-                    "id_column": self.index_config.id_column,
-                },
+                additional_metadata=SqlAdditionalMetadata(
+                    table_name=self.index_config.table_name, id_column=self.index_config.id_column
+                ),
+                batch_items=[BatchItem(identifier=str(b)) for b in batch],
             )
 
 
@@ -176,7 +182,7 @@ class SQLDownloader(Downloader, ABC):
     download_config: SQLDownloaderConfig
 
     @abstractmethod
-    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        pass
 
    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
@@ -185,7 +191,7 @@ class SQLDownloader(Downloader, ABC):
         dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
         return dfs
 
-    def get_data(self, file_data: FileData) -> list[pd.DataFrame]:
+    def get_data(self, file_data: SqlBatchFileData) -> list[pd.DataFrame]:
         rows, columns = self.query_db(file_data=file_data)
         return self.sql_to_df(rows=rows, columns=columns)
 
@@ -199,10 +205,10 @@ class SQLDownloader(Downloader, ABC):
         return f
 
     def generate_download_response(
-        self, result: pd.DataFrame, file_data: FileData
+        self, result: pd.DataFrame, file_data: SqlBatchFileData
     ) -> DownloadResponse:
-        id_column = file_data.additional_metadata["id_column"]
-        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata.id_column
+        table_name = file_data.additional_metadata.table_name
         record_id = result.iloc[0][id_column]
         filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
         filename = f"{filename_id}.csv"
@@ -212,20 +218,19 @@ class SQLDownloader(Downloader, ABC):
         )
         download_path.parent.mkdir(parents=True, exist_ok=True)
         result.to_csv(download_path, index=False)
-        copied_file_data = replace(file_data)
-        copied_file_data.identifier = filename_id
-        copied_file_data.doc_type = "file"
-        copied_file_data.additional_metadata.pop("ids", None)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
         return super().generate_download_response(
-            file_data=copied_file_data, download_path=download_path
+            file_data=cast_file_data, download_path=download_path
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
-        data_dfs = self.get_data(file_data=file_data)
+        sql_filedata = SqlBatchFileData.cast(file_data=file_data)
+        data_dfs = self.get_data(file_data=sql_filedata)
         download_responses = []
         for df in data_dfs:
             download_responses.append(
-                self.generate_download_response(result=df, file_data=file_data)
+                self.generate_download_response(result=df, file_data=sql_filedata)
             )
         return download_responses
 
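Taken together, the sql.py hunks replace the ad hoc additional_metadata dict with typed pydantic models shared by every SQL connector. A minimal sketch of the new shape, using only names from the diff above; the connector, table, column, and id values below are placeholders:

    from unstructured_ingest.v2.interfaces import BatchItem, FileDataSourceMetadata
    from unstructured_ingest.v2.processes.connectors.sql.sql import (
        SqlAdditionalMetadata,
        SqlBatchFileData,
    )

    # Sketch only: mirrors what SQLIndexer.run() now yields for each id batch.
    file_data = SqlBatchFileData(
        connector_type="sqlite",
        metadata=FileDataSourceMetadata(date_processed="1700000000.0"),
        additional_metadata=SqlAdditionalMetadata(table_name="elements", id_column="id"),
        batch_items=[BatchItem(identifier="1"), BatchItem(identifier="2")],
    )

    # Downloaders now read attributes instead of dict keys:
    table_name = file_data.additional_metadata.table_name
    ids = [item.identifier for item in file_data.batch_items]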
unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Generator
 
 from pydantic import Field, Secret, model_validator
 
-from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -15,6 +14,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
     _DATE_COLUMNS,
     SQLAccessConfig,
+    SqlBatchFileData,
     SQLConnectionConfig,
     SQLDownloader,
     SQLDownloaderConfig,
@@ -97,10 +97,10 @@ class SQLiteDownloader(SQLDownloader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"
 
-    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
-        table_name = file_data.additional_metadata["table_name"]
-        id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = [item.identifier for item in file_data.batch_items]
         with self.connection_config.get_connection() as sqlite_connection:
             cursor = sqlite_connection.cursor()
             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"