unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest might be problematic.

Files changed (52)
  1. test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
  2. test/integration/connectors/sql/test_postgres.py +9 -5
  3. test/integration/connectors/sql/test_singlestore.py +9 -5
  4. test/integration/connectors/sql/test_snowflake.py +6 -2
  5. test/integration/connectors/sql/test_sqlite.py +9 -5
  6. test/integration/connectors/test_astradb.py +40 -0
  7. test/integration/connectors/test_kafka.py +2 -2
  8. test/integration/connectors/test_mongodb.py +4 -1
  9. test/integration/connectors/utils/validation/source.py +31 -11
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  12. unstructured_ingest/v2/interfaces/file_data.py +69 -15
  13. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  15. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  16. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  17. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  18. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  19. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  20. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  21. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  22. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  23. unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
  24. unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
  25. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
  26. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
  27. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
  28. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
  29. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
  30. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  31. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  32. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  33. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  34. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  35. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  36. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  37. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  38. unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
  39. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  40. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  41. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  42. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  43. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  44. unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
  45. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  46. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
  47. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
  48. /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
  49. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
  50. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
  51. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
  52. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/chunk.py
@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.chunker import Chunker
@@ -51,7 +52,7 @@ class ChunkStep(PipelineStep):
  self, fn: Callable, path: str, file_data_path: str, **kwargs
  ) -> ChunkStepResponse:
  path = Path(path)
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=path)
  if not self.should_chunk(filepath=output_filepath, file_data=file_data):
  logger.debug(f"skipping chunking, output already exists: {output_filepath}")
unstructured_ingest/v2/pipeline/steps/download.py
@@ -8,6 +8,7 @@ from typing import Callable, Optional, TypedDict, TypeVar

  from unstructured_ingest.v2.interfaces import FileData, download_responses
  from unstructured_ingest.v2.interfaces.downloader import Downloader
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.utils import serialize_base_model_json
@@ -87,12 +88,12 @@ class DownloadStep(PipelineStep):
  f"match size of local file: {file_size_bytes}, updating"
  )
  file_data.metadata.filesize_bytes = file_size_bytes
- logger.debug(f"updating file data with new content: {file_data.to_dict()}")
+ logger.debug(f"updating file data with new content: {file_data.model_dump()}")
  with file_data_path.open("w") as file:
- json.dump(file_data.to_dict(), file, indent=2)
+ json.dump(file_data.model_dump(), file, indent=2)

  async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  download_path = self.process.get_download_path(file_data=file_data)
  if not self.should_download(file_data=file_data, file_data_path=file_data_path):
  logger.debug(f"skipping download, file already exists locally: {download_path}")
@@ -172,7 +173,7 @@ class DownloadStep(PipelineStep):
  filepath = (self.cache_dir / filename).resolve()
  filepath.parent.mkdir(parents=True, exist_ok=True)
  with open(str(filepath), "w") as f:
- json.dump(file_data.to_dict(), f, indent=2)
+ json.dump(file_data.model_dump(), f, indent=2)
  return str(filepath)

  def get_hash(self, extras: Optional[list[str]]) -> str:
unstructured_ingest/v2/pipeline/steps/embed.py
@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.embedder import Embedder
@@ -49,7 +50,7 @@ class EmbedStep(PipelineStep):

  async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
  path = Path(path)
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=path)
  if not self.should_embed(filepath=output_filepath, file_data=file_data):
  logger.debug(f"skipping embedding, output already exists: {output_filepath}")
unstructured_ingest/v2/pipeline/steps/filter.py
@@ -2,7 +2,7 @@ import asyncio
  from dataclasses import dataclass
  from typing import Callable, Optional

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.filter import Filterer
@@ -20,7 +20,7 @@ class FilterStep(PipelineStep):
  logger.info(f"created {self.identifier} with configs: {config}")

  async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  fn_kwargs = {"file_data": file_data}
  if not asyncio.iscoroutinefunction(fn):
  resp = fn(**fn_kwargs)
unstructured_ingest/v2/pipeline/steps/index.py
@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
  @instrument(span_name=STEP_ID)
  def run(self) -> Generator[str, None, None]:
  for file_data in self.process.run():
- logger.debug(f"generated file data: {file_data.to_dict()}")
+ logger.debug(f"generated file data: {file_data.model_dump()}")
  try:
  record_hash = self.get_hash(extras=[file_data.identifier])
  filename = f"{record_hash}.json"
  filepath = (self.cache_dir / filename).resolve()
  filepath.parent.mkdir(parents=True, exist_ok=True)
  with open(str(filepath), "w") as f:
- json.dump(file_data.to_dict(), f, indent=2)
+ json.dump(file_data.model_dump(), f, indent=2)
  yield str(filepath)
  except Exception as e:
  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):

  async def run_async(self) -> AsyncGenerator[str, None]:
  async for file_data in self.process.run_async():
- logger.debug(f"generated file data: {file_data.to_dict()}")
+ logger.debug(f"generated file data: {file_data.model_dump()}")
  try:
  record_hash = self.get_hash(extras=[file_data.identifier])
  filename = f"{record_hash}.json"
  filepath = (self.cache_dir / filename).resolve()
  filepath.parent.mkdir(parents=True, exist_ok=True)
  with open(str(filepath), "w") as f:
- json.dump(file_data.to_dict(), f, indent=2)
+ json.dump(file_data.model_dump(), f, indent=2)
  yield str(filepath)
  except Exception as e:
  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
unstructured_ingest/v2/pipeline/steps/partition.py
@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.partitioner import Partitioner
@@ -51,12 +52,12 @@ class PartitionStep(PipelineStep):
  self, fn: Callable, path: str, file_data_path: str
  ) -> Optional[PartitionStepResponse]:
  path = Path(path)
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=Path(file_data_path))
  if not self.should_partition(filepath=output_filepath, file_data=file_data):
  logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
  return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
- fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
+ fn_kwargs = {"filename": path, "metadata": file_data.metadata.model_dump()}
  if not asyncio.iscoroutinefunction(fn):
  partitioned_content = fn(**fn_kwargs)
  elif semaphore := self.context.semaphore:
unstructured_ingest/v2/pipeline/steps/stage.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, Optional, TypedDict

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
@@ -43,7 +43,7 @@ class UploadStageStep(PipelineStep):
  output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
  fn_kwargs = {
  "elements_filepath": path,
- "file_data": FileData.from_file(path=file_data_path),
+ "file_data": file_data_from_file(path=file_data_path),
  "output_dir": self.cache_dir,
  "output_filename": output_filename,
  }
unstructured_ingest/v2/pipeline/steps/uncompress.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, TypedDict

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.uncompress import Uncompressor
@@ -28,7 +28,7 @@ class UncompressStep(PipelineStep):
  async def _run_async(
  self, fn: Callable, path: str, file_data_path: str
  ) -> list[UncompressStepResponse]:
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  fn_kwargs = {"file_data": file_data}
  if not asyncio.iscoroutinefunction(fn):
  new_file_data = fn(**fn_kwargs)
unstructured_ingest/v2/pipeline/steps/upload.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, Optional, TypedDict

- from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.interfaces.uploader import UploadContent
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
@@ -41,14 +41,14 @@ class UploadStep(BatchPipelineStep):
  @instrument(span_name=STEP_ID)
  def _run_batch(self, contents: list[UploadStepContent]) -> None:
  upload_contents = [
- UploadContent(path=Path(c["path"]), file_data=FileData.from_file(c["file_data_path"]))
+ UploadContent(path=Path(c["path"]), file_data=file_data_from_file(c["file_data_path"]))
  for c in contents
  ]
  self.process.run_batch(contents=upload_contents)

  async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
  fn = fn or self.process.run_async
- fn_kwargs = {"path": Path(path), "file_data": FileData.from_file(path=file_data_path)}
+ fn_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
  if not asyncio.iscoroutinefunction(fn):
  fn(**fn_kwargs)
  elif semaphore := self.context.semaphore:
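
Taken together, the pipeline-step hunks above make the same two substitutions in every step: the module-level helper file_data_from_file() replaces the FileData.from_file() classmethod used in 0.3.9, and Pydantic's model_dump() replaces to_dict() wherever the file data is logged or serialized. Below is a minimal illustrative sketch (not part of the diff) of the new round-trip, assuming a serialized file-data record at the hypothetical path file_data.json:

    import json
    from pathlib import Path

    from unstructured_ingest.v2.interfaces.file_data import file_data_from_file

    # Hypothetical path to a FileData record written by the index step.
    file_data_path = Path("file_data.json")

    # 0.3.11: load FileData (or a subclass) via the module-level helper.
    file_data = file_data_from_file(path=file_data_path)

    # Serialization now goes through Pydantic's model_dump() rather than to_dict().
    with file_data_path.open("w") as f:
        json.dump(file_data.model_dump(), f, indent=2)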
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -40,6 +40,8 @@ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
  from .milvus import milvus_destination_entry
  from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
  from .mongodb import mongodb_destination_entry, mongodb_source_entry
+ from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
+ from .neo4j import neo4j_destination_entry
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
  from .onedrive import onedrive_destination_entry, onedrive_source_entry
  from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -74,6 +76,7 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
  add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
  add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)

+ add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)

  add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)

unstructured_ingest/v2/processes/connectors/astradb.py
@@ -1,13 +1,11 @@
- import copy
  import csv
  import hashlib
- import sys
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
  from typing import TYPE_CHECKING, Any, Generator, Optional

- from pydantic import Field, Secret
+ from pydantic import BaseModel, Field, Secret

  from unstructured_ingest import __name__ as integration_name
  from unstructured_ingest.__version__ import __version__ as integration_version
@@ -22,6 +20,8 @@ from unstructured_ingest.utils.string_and_date_utils import truncate_string_byte
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
  AccessConfig,
+ BatchFileData,
+ BatchItem,
  ConnectionConfig,
  Downloader,
  DownloaderConfig,
@@ -30,6 +30,7 @@ from unstructured_ingest.v2.interfaces import (
  FileDataSourceMetadata,
  Indexer,
  IndexerConfig,
+ SourceIdentifiers,
  Uploader,
  UploaderConfig,
  UploadStager,
@@ -53,6 +54,15 @@ CONNECTOR_TYPE = "astradb"
  MAX_CONTENT_PARAM_BYTE_SIZE = 8000


+ class AstraDBAdditionalMetadata(BaseModel):
+ collection_name: str
+ keyspace: Optional[str] = None
+
+
+ class AstraDBBatchFileData(BatchFileData):
+ additional_metadata: AstraDBAdditionalMetadata
+
+
  class AstraDBAccessConfig(AccessConfig):
  token: str = Field(description="Astra DB Token with access to the database.")
  api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
@@ -179,9 +189,6 @@ class AstraDBIndexer(Indexer):

  def _get_doc_ids(self) -> set[str]:
  """Fetches all document ids in an index"""
- # Initialize set of ids
- ids = set()
-
  # Get the collection
  collection = self.get_collection()

@@ -194,31 +201,26 @@ class AstraDBIndexer(Indexer):
  astra_db_docs.append(result)

  # Create file data for each astra record
- for astra_record in astra_db_docs:
- ids.add(astra_record["_id"])
+ ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])

- return ids
+ return set(ids)

- def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+ def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
  all_ids = self._get_doc_ids()
  ids = list(all_ids)
  id_batches = batch_generator(ids, self.index_config.batch_size)

  for batch in id_batches:
- # Make sure the hash is always a positive number to create identified
- identified = str(hash(batch) + sys.maxsize + 1)
- fd = FileData(
- identifier=identified,
+ fd = AstraDBBatchFileData(
  connector_type=CONNECTOR_TYPE,
- doc_type="batch",
  metadata=FileDataSourceMetadata(
  date_processed=str(time()),
  ),
- additional_metadata={
- "ids": list(batch),
- "collection_name": self.index_config.collection_name,
- "keyspace": self.index_config.keyspace,
- },
+ additional_metadata=AstraDBAdditionalMetadata(
+ collection_name=self.index_config.collection_name,
+ keyspace=self.index_config.keyspace,
+ ),
+ batch_items=[BatchItem(identifier=b) for b in batch],
  )
  yield fd

@@ -247,7 +249,9 @@ class AstraDBDownloader(Downloader):
  writer.writerow(astra_result.keys())
  writer.writerow(astra_result.values())

- def generate_download_response(self, result: dict, file_data: FileData) -> DownloadResponse:
+ def generate_download_response(
+ self, result: dict, file_data: AstraDBBatchFileData
+ ) -> DownloadResponse:
  record_id = result["_id"]
  filename_id = self.get_identifier(record_id=record_id)
  filename = f"{filename_id}.csv"  # csv to preserve column info
@@ -255,7 +259,7 @@ class AstraDBDownloader(Downloader):
  logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
  download_path.parent.mkdir(parents=True, exist_ok=True)
  try:
- self.write_astra_result_to_csv(astra_result=result, download_path=download_path)
+ self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
  except Exception as e:
  logger.error(
  f"failed to download from record {record_id} to {download_path}: {e}",
@@ -264,14 +268,13 @@ class AstraDBDownloader(Downloader):
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

  # modify input file_data for download_response
- copied_file_data = copy.deepcopy(file_data)
- copied_file_data.identifier = filename
- copied_file_data.doc_type = "file"
- copied_file_data.metadata.date_processed = str(time())
- copied_file_data.metadata.record_locator = {"document_id": record_id}
- copied_file_data.additional_metadata.pop("ids", None)
+ file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+ cast_file_data = FileData.cast(file_data=file_data)
+ cast_file_data.identifier = filename
+ cast_file_data.metadata.date_processed = str(time())
+ cast_file_data.metadata.record_locator = {"document_id": record_id}
  return super().generate_download_response(
- file_data=copied_file_data, download_path=download_path
+ file_data=cast_file_data, download_path=download_path
  )

  def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -279,9 +282,10 @@ class AstraDBDownloader(Downloader):

  async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
  # Get metadata from file_data
- ids: list[str] = file_data.additional_metadata["ids"]
- collection_name: str = file_data.additional_metadata["collection_name"]
- keyspace: str = file_data.additional_metadata["keyspace"]
+ astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+ ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+ collection_name: str = astra_file_data.additional_metadata.collection_name
+ keyspace: str = astra_file_data.additional_metadata.keyspace

  # Retrieve results from async collection
  download_responses = []
@@ -292,7 +296,7 @@ class AstraDBDownloader(Downloader):
  )
  async for result in async_astra_collection.find({"_id": {"$in": ids}}):
  download_responses.append(
- self.generate_download_response(result=result, file_data=file_data)
+ self.generate_download_response(result=result, file_data=astra_file_data)
  )
  return download_responses

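The Astra DB indexer now emits a typed batch record (AstraDBBatchFileData with BatchItem entries and an AstraDBAdditionalMetadata model) instead of a hash-identified FileData carrying a loose additional_metadata dict, and the downloader narrows the incoming record back to that type with .cast(). A hedged sketch of the new shape, mirroring this hunk (collection name and document ids are placeholders; importing the connector module assumes the astradb extra is installed):

    from time import time

    from unstructured_ingest.v2.interfaces import BatchItem, FileDataSourceMetadata
    from unstructured_ingest.v2.processes.connectors.astradb import (
        AstraDBAdditionalMetadata,
        AstraDBBatchFileData,
    )

    # Indexer side: one batch record per group of document ids.
    fd = AstraDBBatchFileData(
        connector_type="astradb",
        metadata=FileDataSourceMetadata(date_processed=str(time())),
        additional_metadata=AstraDBAdditionalMetadata(
            collection_name="my_collection",  # placeholder value
            keyspace=None,
        ),
        batch_items=[BatchItem(identifier=doc_id) for doc_id in ["doc-1", "doc-2"]],
    )

    # Downloader side: narrow a generic FileData back to the batch type and
    # read typed fields instead of dict keys.
    astra_fd = AstraDBBatchFileData.cast(file_data=fd)
    ids = [item.identifier for item in astra_fd.batch_items]
    collection_name = astra_fd.additional_metadata.collection_name
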
unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -1,5 +1,4 @@
  import hashlib
- import sys
  import time
  from contextlib import contextmanager
  from dataclasses import dataclass, field
@@ -7,7 +6,7 @@ from datetime import timedelta
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, List

- from pydantic import Field, Secret
+ from pydantic import BaseModel, Field, Secret

  from unstructured_ingest.error import (
  DestinationConnectionError,
@@ -18,6 +17,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
  AccessConfig,
+ BatchFileData,
+ BatchItem,
  ConnectionConfig,
  Downloader,
  DownloaderConfig,
@@ -26,6 +27,7 @@ from unstructured_ingest.v2.interfaces import (
  FileDataSourceMetadata,
  Indexer,
  IndexerConfig,
+ SourceIdentifiers,
  Uploader,
  UploaderConfig,
  UploadStager,
@@ -40,11 +42,20 @@ from unstructured_ingest.v2.processes.connector_registry import (

  if TYPE_CHECKING:
  from couchbase.cluster import Cluster
+ from couchbase.collection import Collection

  CONNECTOR_TYPE = "couchbase"
  SERVER_API_VERSION = "1"


+ class CouchbaseAdditionalMetadata(BaseModel):
+ bucket: str
+
+
+ class CouchbaseBatchFileData(BatchFileData):
+ additional_metadata: CouchbaseAdditionalMetadata
+
+
  class CouchbaseAccessConfig(AccessConfig):
  password: str = Field(description="The password for the Couchbase server")

@@ -180,31 +191,21 @@ class CouchbaseIndexer(Indexer):
  if attempts == max_attempts:
  raise SourceConnectionError(f"failed to get document ids: {e}")

- def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+ def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
  ids = self._get_doc_ids()
-
- id_batches = [
- ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
- for i in range(
- (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
- )
- ]
- for batch in id_batches:
+ for batch in batch_generator(ids, self.index_config.batch_size):
  # Make sure the hash is always a positive number to create identified
- identified = str(hash(tuple(batch)) + sys.maxsize + 1)
- yield FileData(
- identifier=identified,
+ yield CouchbaseBatchFileData(
  connector_type=CONNECTOR_TYPE,
- doc_type="batch",
  metadata=FileDataSourceMetadata(
  url=f"{self.connection_config.connection_string}/"
  f"{self.connection_config.bucket}",
  date_processed=str(time.time()),
  ),
- additional_metadata={
- "ids": list(batch),
- "bucket": self.connection_config.bucket,
- },
+ additional_metadata=CouchbaseAdditionalMetadata(
+ bucket=self.connection_config.bucket
+ ),
+ batch_items=[BatchItem(identifier=b) for b in batch],
  )


@@ -241,7 +242,7 @@ class CouchbaseDownloader(Downloader):
  return concatenated_values

  def generate_download_response(
- self, result: dict, bucket: str, file_data: FileData
+ self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
  ) -> DownloadResponse:
  record_id = result[self.download_config.collection_id]
  filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
@@ -261,28 +262,26 @@ class CouchbaseDownloader(Downloader):
  exc_info=True,
  )
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
- return DownloadResponse(
- file_data=FileData(
- identifier=filename_id,
- connector_type=CONNECTOR_TYPE,
- metadata=FileDataSourceMetadata(
- version=None,
- date_processed=str(time.time()),
- record_locator={
- "connection_string": self.connection_config.connection_string,
- "bucket": bucket,
- "scope": self.connection_config.scope,
- "collection": self.connection_config.collection,
- "document_id": record_id,
- },
- ),
- ),
- path=download_path,
+ file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+ cast_file_data = FileData.cast(file_data=file_data)
+ cast_file_data.identifier = filename_id
+ cast_file_data.metadata.date_processed = str(time.time())
+ cast_file_data.metadata.record_locator = {
+ "connection_string": self.connection_config.connection_string,
+ "bucket": bucket,
+ "scope": self.connection_config.scope,
+ "collection": self.connection_config.collection,
+ "document_id": record_id,
+ }
+ return super().generate_download_response(
+ file_data=cast_file_data,
+ download_path=download_path,
  )

  def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
- bucket_name: str = file_data.additional_metadata["bucket"]
- ids: list[str] = file_data.additional_metadata["ids"]
+ couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+ bucket_name: str = couchbase_file_data.additional_metadata.bucket
+ ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]

  with self.connection_config.get_client() as client:
  bucket = client.bucket(bucket_name)
@@ -292,13 +291,25 @@ class CouchbaseDownloader(Downloader):
  download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
  return list(download_resp)

- def process_doc_id(self, doc_id, collection, bucket_name, file_data):
+ def process_doc_id(
+ self,
+ doc_id: str,
+ collection: "Collection",
+ bucket_name: str,
+ file_data: CouchbaseBatchFileData,
+ ):
  result = collection.get(doc_id)
  return self.generate_download_response(
  result=result.content_as[dict], bucket=bucket_name, file_data=file_data
  )

- def process_all_doc_ids(self, ids, collection, bucket_name, file_data):
+ def process_all_doc_ids(
+ self,
+ ids: list[str],
+ collection: "Collection",
+ bucket_name: str,
+ file_data: CouchbaseBatchFileData,
+ ):
  for doc_id in ids:
  yield self.process_doc_id(doc_id, collection, bucket_name, file_data)

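The Couchbase downloader follows the same pattern introduced for Astra DB: rather than constructing a brand-new FileData (or deep-copying the input), it attaches SourceIdentifiers to the batch record and narrows it with FileData.cast() before delegating to the base generate_download_response(). A rough sketch of that idiom as a standalone helper (the helper name and arguments are illustrative, not part of the library):

    from time import time

    from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers

    def as_download_file_data(batch_file_data, filename: str, record_locator: dict) -> FileData:
        # Attach per-record source identifiers to the batch record, then cast it
        # down to a plain FileData for the download response.
        batch_file_data.source_identifiers = SourceIdentifiers(
            filename=filename, fullpath=filename
        )
        file_data = FileData.cast(file_data=batch_file_data)
        file_data.identifier = filename
        file_data.metadata.date_processed = str(time())
        file_data.metadata.record_locator = record_locator
        return file_data
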
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -14,6 +14,7 @@ from unstructured_ingest.error import (
  )
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
+ AccessConfig,
  ConnectionConfig,
  Downloader,
  DownloaderConfig,
@@ -52,6 +53,10 @@ class DatabricksPathMixin(BaseModel):
  return path


+ class DatabricksVolumesAccessConfig(AccessConfig):
+ token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+
+
  class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
  host: Optional[str] = Field(
  default=None,
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py
@@ -3,12 +3,12 @@ from typing import Optional

  from pydantic import Field, Secret

- from unstructured_ingest.v2.interfaces import AccessConfig
  from unstructured_ingest.v2.processes.connector_registry import (
  DestinationRegistryEntry,
  SourceRegistryEntry,
  )
  from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+ DatabricksVolumesAccessConfig,
  DatabricksVolumesConnectionConfig,
  DatabricksVolumesDownloader,
  DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (

  CONNECTOR_TYPE = "databricks_volumes_aws"

- class DatabricksAWSVolumesAccessConfig(AccessConfig):
+ class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
  account_id: Optional[str] = Field(
  default=None,
  description="The Databricks account ID for the Databricks " "accounts endpoint",
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py
@@ -3,12 +3,12 @@ from typing import Optional

  from pydantic import Field, Secret

- from unstructured_ingest.v2.interfaces import AccessConfig
  from unstructured_ingest.v2.processes.connector_registry import (
  DestinationRegistryEntry,
  SourceRegistryEntry,
  )
  from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+ DatabricksVolumesAccessConfig,
  DatabricksVolumesConnectionConfig,
  DatabricksVolumesDownloader,
  DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (

  CONNECTOR_TYPE = "databricks_volumes_azure"

- class DatabricksAzureVolumesAccessConfig(AccessConfig):
+ class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
  account_id: Optional[str] = Field(
  default=None,
  description="The Databricks account ID for the Databricks " "accounts endpoint.",
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py
@@ -3,12 +3,12 @@ from typing import Optional

  from pydantic import Field, Secret

- from unstructured_ingest.v2.interfaces import AccessConfig
  from unstructured_ingest.v2.processes.connector_registry import (
  DestinationRegistryEntry,
  SourceRegistryEntry,
  )
  from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+ DatabricksVolumesAccessConfig,
  DatabricksVolumesConnectionConfig,
  DatabricksVolumesDownloader,
  DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (

  CONNECTOR_TYPE = "databricks_volumes_gcp"

- class DatabricksGoogleVolumesAccessConfig(AccessConfig):
+ class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
  account_id: Optional[str] = Field(
  default=None,
  description="The Databricks account ID for the Databricks " "accounts endpoint.",