unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest was flagged as potentially problematic by the registry diff service.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/interfaces/upload_stager.py

@@ -1,8 +1,10 @@
- from abc import ABC, abstractmethod
+ import json
+ from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, TypeVar

+ import ndjson
  from pydantic import BaseModel

  from unstructured_ingest.v2.interfaces.file_data import FileData
@@ -20,16 +22,78 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
  class UploadStager(BaseProcess, ABC):
      upload_stager_config: UploadStagerConfigT

-     @abstractmethod
+     def write_output(self, output_path: Path, data: list[dict]) -> None:
+         if output_path.suffix == ".json":
+             with output_path.open("w") as f:
+                 json.dump(data, f, indent=2)
+         elif output_path.suffix == ".ndjson":
+             with output_path.open("w") as f:
+                 ndjson.dump(data, f)
+         else:
+             raise ValueError(f"Unsupported output format: {output_path}")
+
+     def get_data(self, elements_filepath: Path) -> list[dict]:
+         if elements_filepath.suffix == ".json":
+             with elements_filepath.open() as f:
+                 return json.load(f)
+         elif elements_filepath.suffix == ".ndjson":
+             with elements_filepath.open() as f:
+                 return ndjson.load(f)
+         else:
+             raise ValueError(f"Unsupported input format: {elements_filepath}")
+
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+         return element_dict
+
+     def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
+         output_path = Path(output_filename)
+         output_filename = f"{Path(output_filename).stem}{output_path.suffix}"
+         output_path = Path(output_dir) / Path(f"{output_filename}")
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         return output_path
+
+     def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+         with input_file.open() as in_f:
+             reader = ndjson.reader(in_f)
+             with output_file.open("w") as out_f:
+                 writer = ndjson.writer(out_f)
+                 for element in reader:
+                     conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
+                     writer.writerow(row=conformed_element)
+                     writer.f.flush()
+
+     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+         with input_file.open() as in_f:
+             elements_contents = json.load(in_f)
+
+         conformed_elements = [
+             self.conform_dict(element_dict=element, file_data=file_data)
+             for element in elements_contents
+         ]
+
+         with open(output_file, "w") as out_f:
+             json.dump(conformed_elements, out_f, indent=2)
+
      def run(
          self,
          elements_filepath: Path,
          file_data: FileData,
          output_dir: Path,
          output_filename: str,
-         **kwargs: Any
+         **kwargs: Any,
      ) -> Path:
-         pass
+         output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+         if elements_filepath.suffix == ".ndjson":
+             self.stream_update(
+                 input_file=elements_filepath, output_file=output_file, file_data=file_data
+             )
+         elif elements_filepath.suffix == ".json":
+             self.process_whole(
+                 input_file=elements_filepath, output_file=output_file, file_data=file_data
+             )
+         else:
+             raise ValueError(f"Unsupported file extension: {elements_filepath}")
+         return output_file

      async def run_async(
          self,
@@ -37,12 +101,12 @@ class UploadStager(BaseProcess, ABC):
          file_data: FileData,
          output_dir: Path,
          output_filename: str,
-         **kwargs: Any
+         **kwargs: Any,
      ) -> Path:
          return self.run(
              elements_filepath=elements_filepath,
              output_dir=output_dir,
              output_filename=output_filename,
              file_data=file_data,
-             **kwargs
+             **kwargs,
          )

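The abstract run() has been replaced with a concrete implementation that reads .json or .ndjson elements, passes each element through conform_dict(), and writes the result back out, so connector stagers now only need to override conform_dict(). A minimal sketch of a subclass under the new base class; the class name and the injected key are illustrative, not part of the package:

from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager


class ExampleUploadStager(UploadStager):  # hypothetical subclass, for illustration only
    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        # tag each element with its source record; the base class now handles
        # reading/writing .json and .ndjson and building the output path
        element_dict["source_record_id"] = file_data.identifier
        return element_dict
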
unstructured_ingest/v2/interfaces/uploader.py

@@ -5,6 +5,7 @@ from typing import Any, TypeVar

  from pydantic import BaseModel

+ from unstructured_ingest.utils.data_prep import get_data
  from unstructured_ingest.v2.interfaces.connector import BaseConnector
  from unstructured_ingest.v2.interfaces.file_data import FileData
  from unstructured_ingest.v2.interfaces.process import BaseProcess
@@ -38,7 +39,15 @@ class Uploader(BaseProcess, BaseConnector, ABC):
          raise NotImplementedError()

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         raise NotImplementedError()
+         data = get_data(path=path)
+         self.run_data(data=data, file_data=file_data, **kwargs)

      async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return self.run(contents=[UploadContent(path=path, file_data=file_data)], **kwargs)
+         data = get_data(path=path)
+         await self.run_data_async(data=data, file_data=file_data, **kwargs)
+
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         raise NotImplementedError()
+
+     async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         return self.run_data(data=data, file_data=file_data, **kwargs)

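With get_data() reading both staged formats, destination connectors implement run_data()/run_data_async() on parsed element dictionaries instead of re-opening the staged file inside run(). A rough sketch of the new contract; the class is hypothetical and real connectors also declare their connection and upload configs:

from typing import Any

from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.uploader import Uploader


class ExampleUploader(Uploader):  # hypothetical, for illustration only
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        # the base class run()/run_async() already loaded the staged .json/.ndjson
        # file via get_data(), so only the destination write logic lives here
        print(f"would write {len(data)} elements for record {file_data.identifier}")
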
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.chunker import Chunker
@@ -51,7 +52,7 @@ class ChunkStep(PipelineStep):
          self, fn: Callable, path: str, file_data_path: str, **kwargs
      ) -> ChunkStepResponse:
          path = Path(path)
-         file_data = FileData.from_file(path=file_data_path)
+         file_data = file_data_from_file(path=file_data_path)
          output_filepath = self.get_output_filepath(filename=path)
          if not self.should_chunk(filepath=output_filepath, file_data=file_data):
              logger.debug(f"skipping chunking, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/download.py

@@ -8,6 +8,7 @@ from typing import Callable, Optional, TypedDict, TypeVar

  from unstructured_ingest.v2.interfaces import FileData, download_responses
  from unstructured_ingest.v2.interfaces.downloader import Downloader
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.utils import serialize_base_model_json
@@ -87,12 +88,12 @@ class DownloadStep(PipelineStep):
                  f"match size of local file: {file_size_bytes}, updating"
              )
              file_data.metadata.filesize_bytes = file_size_bytes
-             logger.debug(f"updating file data with new content: {file_data.to_dict()}")
+             logger.debug(f"updating file data with new content: {file_data.model_dump()}")
              with file_data_path.open("w") as file:
-                 json.dump(file_data.to_dict(), file, indent=2)
+                 json.dump(file_data.model_dump(), file, indent=2)

      async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
-         file_data = FileData.from_file(path=file_data_path)
+         file_data = file_data_from_file(path=file_data_path)
          download_path = self.process.get_download_path(file_data=file_data)
          if not self.should_download(file_data=file_data, file_data_path=file_data_path):
              logger.debug(f"skipping download, file already exists locally: {download_path}")
@@ -172,7 +173,7 @@ class DownloadStep(PipelineStep):
          filepath = (self.cache_dir / filename).resolve()
          filepath.parent.mkdir(parents=True, exist_ok=True)
          with open(str(filepath), "w") as f:
-             json.dump(file_data.to_dict(), f, indent=2)
+             json.dump(file_data.model_dump(), f, indent=2)
          return str(filepath)

      def get_hash(self, extras: Optional[list[str]]) -> str:

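Throughout the pipeline steps, FileData.to_dict() and FileData.from_file() give way to pydantic's model_dump() and the new module-level file_data_from_file() helper. A hedged sketch of the round trip these steps now perform, assuming an existing cached record at an illustrative path:

import json
from pathlib import Path

from unstructured_ingest.v2.interfaces.file_data import file_data_from_file

file_data_path = Path("/tmp/ingest-cache/record.json")  # illustrative cache location

# reload the indexed record the way the steps now do
file_data = file_data_from_file(path=str(file_data_path))

# persist it again with the pydantic serializer that replaces to_dict()
with file_data_path.open("w") as f:
    json.dump(file_data.model_dump(), f, indent=2)
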
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.embedder import Embedder
@@ -49,7 +50,7 @@ class EmbedStep(PipelineStep):

      async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
          path = Path(path)
-         file_data = FileData.from_file(path=file_data_path)
+         file_data = file_data_from_file(path=file_data_path)
          output_filepath = self.get_output_filepath(filename=path)
          if not self.should_embed(filepath=output_filepath, file_data=file_data):
              logger.debug(f"skipping embedding, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/filter.py

@@ -2,7 +2,7 @@ import asyncio
  from dataclasses import dataclass
  from typing import Callable, Optional

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.filter import Filterer
@@ -20,7 +20,7 @@ class FilterStep(PipelineStep):
          logger.info(f"created {self.identifier} with configs: {config}")

      async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
-         file_data = FileData.from_file(path=file_data_path)
+         file_data = file_data_from_file(path=file_data_path)
          fn_kwargs = {"file_data": file_data}
          if not asyncio.iscoroutinefunction(fn):
              resp = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/index.py

@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
      @instrument(span_name=STEP_ID)
      def run(self) -> Generator[str, None, None]:
          for file_data in self.process.run():
-             logger.debug(f"generated file data: {file_data.to_dict()}")
+             logger.debug(f"generated file data: {file_data.model_dump()}")
              try:
                  record_hash = self.get_hash(extras=[file_data.identifier])
                  filename = f"{record_hash}.json"
                  filepath = (self.cache_dir / filename).resolve()
                  filepath.parent.mkdir(parents=True, exist_ok=True)
                  with open(str(filepath), "w") as f:
-                     json.dump(file_data.to_dict(), f, indent=2)
+                     json.dump(file_data.model_dump(), f, indent=2)
                  yield str(filepath)
              except Exception as e:
                  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):

      async def run_async(self) -> AsyncGenerator[str, None]:
          async for file_data in self.process.run_async():
-             logger.debug(f"generated file data: {file_data.to_dict()}")
+             logger.debug(f"generated file data: {file_data.model_dump()}")
              try:
                  record_hash = self.get_hash(extras=[file_data.identifier])
                  filename = f"{record_hash}.json"
                  filepath = (self.cache_dir / filename).resolve()
                  filepath.parent.mkdir(parents=True, exist_ok=True)
                  with open(str(filepath), "w") as f:
-                     json.dump(file_data.to_dict(), f, indent=2)
+                     json.dump(file_data.model_dump(), f, indent=2)
                  yield str(filepath)
              except Exception as e:
                  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.partitioner import Partitioner
@@ -51,12 +52,12 @@ class PartitionStep(PipelineStep):
          self, fn: Callable, path: str, file_data_path: str
      ) -> Optional[PartitionStepResponse]:
          path = Path(path)
-         file_data = FileData.from_file(path=file_data_path)
+         file_data = file_data_from_file(path=file_data_path)
          output_filepath = self.get_output_filepath(filename=Path(file_data_path))
          if not self.should_partition(filepath=output_filepath, file_data=file_data):
              logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
              return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-         fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
+         fn_kwargs = {"filename": path, "metadata": file_data.metadata.model_dump()}
          if not asyncio.iscoroutinefunction(fn):
              partitioned_content = fn(**fn_kwargs)
          elif semaphore := self.context.semaphore:

unstructured_ingest/v2/pipeline/steps/stage.py

@@ -4,7 +4,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, Optional, TypedDict

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
@@ -39,11 +39,13 @@ class UploadStageStep(PipelineStep):
          self, fn: Callable, path: str, file_data_path: str
      ) -> UploadStageStepResponse:
          path = Path(path)
+         # Maintain extension
+         output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
          fn_kwargs = {
              "elements_filepath": path,
-             "file_data": FileData.from_file(path=file_data_path),
+             "file_data": file_data_from_file(path=file_data_path),
              "output_dir": self.cache_dir,
-             "output_filename": self.get_hash(extras=[path.name]),
+             "output_filename": output_filename,
          }
          if not asyncio.iscoroutinefunction(fn):
              staged_output_path = fn(**fn_kwargs)

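Because stagers may now emit .ndjson as well as .json, the stage step keeps the partitioned file's extension when naming the staged output. Illustrative only; the hash literal stands in for self.get_hash(extras=[path.name]):

from pathlib import Path

path = Path("partitioned/example-elements.ndjson")  # hypothetical partitioner output
record_hash = "9a8b7c"  # stands in for self.get_hash(extras=[path.name])
output_filename = f"{record_hash}{path.suffix}"
assert output_filename == "9a8b7c.ndjson"  # extension preserved for the stager
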
unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -3,7 +3,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, TypedDict

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.uncompress import Uncompressor
@@ -28,7 +28,7 @@ class UncompressStep(PipelineStep):
      async def _run_async(
          self, fn: Callable, path: str, file_data_path: str
      ) -> list[UncompressStepResponse]:
-         file_data = FileData.from_file(path=file_data_path)
+         file_data = file_data_from_file(path=file_data_path)
          fn_kwargs = {"file_data": file_data}
          if not asyncio.iscoroutinefunction(fn):
              new_file_data = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/upload.py

@@ -3,7 +3,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, Optional, TypedDict

- from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.interfaces.uploader import UploadContent
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
@@ -41,14 +41,14 @@ class UploadStep(BatchPipelineStep):
      @instrument(span_name=STEP_ID)
      def _run_batch(self, contents: list[UploadStepContent]) -> None:
          upload_contents = [
-             UploadContent(path=Path(c["path"]), file_data=FileData.from_file(c["file_data_path"]))
+             UploadContent(path=Path(c["path"]), file_data=file_data_from_file(c["file_data_path"]))
              for c in contents
          ]
          self.process.run_batch(contents=upload_contents)

      async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
          fn = fn or self.process.run_async
-         fn_kwargs = {"path": Path(path), "file_data": FileData.from_file(path=file_data_path)}
+         fn_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
          if not asyncio.iscoroutinefunction(fn):
              fn(**fn_kwargs)
          elif semaphore := self.context.semaphore:

unstructured_ingest/v2/processes/connectors/__init__.py

@@ -40,6 +40,8 @@ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
  from .milvus import milvus_destination_entry
  from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
  from .mongodb import mongodb_destination_entry, mongodb_source_entry
+ from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
+ from .neo4j import neo4j_destination_entry
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
  from .onedrive import onedrive_destination_entry, onedrive_source_entry
  from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -74,6 +76,7 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
  add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
  add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)

+ add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)

  add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)

unstructured_ingest/v2/processes/connectors/astradb.py

@@ -1,14 +1,11 @@
- import copy
  import csv
  import hashlib
- import json
- import sys
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
  from typing import TYPE_CHECKING, Any, Generator, Optional

- from pydantic import Field, Secret
+ from pydantic import BaseModel, Field, Secret

  from unstructured_ingest import __name__ as integration_name
  from unstructured_ingest.__version__ import __version__ as integration_version
@@ -17,12 +14,14 @@ from unstructured_ingest.error import (
      SourceConnectionError,
      SourceConnectionNetworkError,
  )
- from unstructured_ingest.utils.data_prep import batch_generator
+ from unstructured_ingest.utils.data_prep import batch_generator, get_data
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
+     BatchFileData,
+     BatchItem,
      ConnectionConfig,
      Downloader,
      DownloaderConfig,
@@ -54,6 +53,15 @@ CONNECTOR_TYPE = "astradb"
  MAX_CONTENT_PARAM_BYTE_SIZE = 8000


+ class AstraDBAdditionalMetadata(BaseModel):
+     collection_name: str
+     keyspace: Optional[str] = None
+
+
+ class AstraDBBatchFileData(BatchFileData):
+     additional_metadata: AstraDBAdditionalMetadata
+
+
  class AstraDBAccessConfig(AccessConfig):
      token: str = Field(description="Astra DB Token with access to the database.")
      api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
@@ -180,9 +188,6 @@ class AstraDBIndexer(Indexer):

      def _get_doc_ids(self) -> set[str]:
          """Fetches all document ids in an index"""
-         # Initialize set of ids
-         ids = set()
-
          # Get the collection
          collection = self.get_collection()

@@ -195,31 +200,26 @@
              astra_db_docs.append(result)

          # Create file data for each astra record
-         for astra_record in astra_db_docs:
-             ids.add(astra_record["_id"])
+         ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])

-         return ids
+         return set(ids)

-     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+     def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
          all_ids = self._get_doc_ids()
          ids = list(all_ids)
          id_batches = batch_generator(ids, self.index_config.batch_size)

          for batch in id_batches:
-             # Make sure the hash is always a positive number to create identified
-             identified = str(hash(batch) + sys.maxsize + 1)
-             fd = FileData(
-                 identifier=identified,
+             fd = AstraDBBatchFileData(
                  connector_type=CONNECTOR_TYPE,
-                 doc_type="batch",
                  metadata=FileDataSourceMetadata(
                      date_processed=str(time()),
                  ),
-                 additional_metadata={
-                     "ids": list(batch),
-                     "collection_name": self.index_config.collection_name,
-                     "keyspace": self.index_config.keyspace,
-                 },
+                 additional_metadata=AstraDBAdditionalMetadata(
+                     collection_name=self.index_config.collection_name,
+                     keyspace=self.index_config.keyspace,
+                 ),
+                 batch_items=[BatchItem(identifier=b) for b in batch],
              )
              yield fd

@@ -248,7 +248,9 @@ class AstraDBDownloader(Downloader):
          writer.writerow(astra_result.keys())
          writer.writerow(astra_result.values())

-     def generate_download_response(self, result: dict, file_data: FileData) -> DownloadResponse:
+     def generate_download_response(
+         self, result: dict, file_data: AstraDBBatchFileData
+     ) -> DownloadResponse:
          record_id = result["_id"]
          filename_id = self.get_identifier(record_id=record_id)
          filename = f"{filename_id}.csv"  # csv to preserve column info
@@ -256,7 +258,7 @@
          logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
          download_path.parent.mkdir(parents=True, exist_ok=True)
          try:
-             self.write_astra_result_to_csv(astra_result=result, download_path=download_path)
+             self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
          except Exception as e:
              logger.error(
                  f"failed to download from record {record_id} to {download_path}: {e}",
@@ -265,14 +267,12 @@
              raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

          # modify input file_data for download_response
-         copied_file_data = copy.deepcopy(file_data)
-         copied_file_data.identifier = filename
-         copied_file_data.doc_type = "file"
-         copied_file_data.metadata.date_processed = str(time())
-         copied_file_data.metadata.record_locator = {"document_id": record_id}
-         copied_file_data.additional_metadata.pop("ids", None)
+         cast_file_data = FileData.cast(file_data=file_data)
+         cast_file_data.identifier = filename
+         cast_file_data.metadata.date_processed = str(time())
+         cast_file_data.metadata.record_locator = {"document_id": record_id}
          return super().generate_download_response(
-             file_data=copied_file_data, download_path=download_path
+             file_data=cast_file_data, download_path=download_path
          )

      def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -280,9 +280,10 @@

      async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
          # Get metadata from file_data
-         ids: list[str] = file_data.additional_metadata["ids"]
-         collection_name: str = file_data.additional_metadata["collection_name"]
-         keyspace: str = file_data.additional_metadata["keyspace"]
+         astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+         ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+         collection_name: str = astra_file_data.additional_metadata.collection_name
+         keyspace: str = astra_file_data.additional_metadata.keyspace

          # Retrieve results from async collection
          download_responses = []
@@ -293,7 +294,7 @@
          )
          async for result in async_astra_collection.find({"_id": {"$in": ids}}):
              download_responses.append(
-                 self.generate_download_response(result=result, file_data=file_data)
+                 self.generate_download_response(result=result, file_data=astra_file_data)
              )
          return download_responses

@@ -325,29 +326,6 @@ class AstraDBUploadStager(UploadStager):
              "metadata": element_dict,
          }

-     def run(
-         self,
-         elements_filepath: Path,
-         file_data: FileData,
-         output_dir: Path,
-         output_filename: str,
-         **kwargs: Any,
-     ) -> Path:
-         with open(elements_filepath) as elements_file:
-             elements_contents = json.load(elements_file)
-         conformed_elements = []
-         for element in elements_contents:
-             conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
-         output_filename_path = Path(output_filename)
-         if output_filename_path.suffix == ".json":
-             output_path = Path(output_dir) / output_filename_path
-         else:
-             output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-         with open(output_path, "w") as output_file:
-             json.dump(conformed_elements, output_file, indent=2)
-         return output_path
-


  @dataclass
@@ -386,11 +364,9 @@ class AstraDBUploader(Uploader):
              f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
          )

-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         with path.open("r") as file:
-             elements_dict = json.load(file)
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
          logger.info(
-             f"writing {len(elements_dict)} objects to destination "
+             f"writing {len(data)} objects to destination "
              f"collection {self.upload_config.collection_name}"
          )

@@ -399,9 +375,13 @@

          self.delete_by_record_id(collection=collection, file_data=file_data)

-         for chunk in batch_generator(elements_dict, astra_db_batch_size):
+         for chunk in batch_generator(data, astra_db_batch_size):
              collection.insert_many(chunk)
+
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         data = get_data(path=path)
+         self.run_data(data=data, file_data=file_data, **kwargs)


  astra_db_source_entry = SourceRegistryEntry(
      indexer=AstraDBIndexer,

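The loose "batch" FileData (hash-derived identifier plus an untyped additional_metadata dict) is replaced by typed BatchFileData subclasses. A rough sketch of the round trip the indexer and downloader now share; literal values are illustrative and the import locations are assumed from the hunks above:

from time import time

from unstructured_ingest.v2.interfaces import BatchItem, FileDataSourceMetadata
from unstructured_ingest.v2.processes.connectors.astradb import (
    AstraDBAdditionalMetadata,
    AstraDBBatchFileData,
)

# indexer side: one typed record per batch of document ids
batch = AstraDBBatchFileData(
    connector_type="astradb",
    metadata=FileDataSourceMetadata(date_processed=str(time())),
    additional_metadata=AstraDBAdditionalMetadata(collection_name="elements", keyspace=None),
    batch_items=[BatchItem(identifier="doc-1"), BatchItem(identifier="doc-2")],
)

# downloader side: recover the typed view (in the pipeline this arrives as a plain FileData)
astra_file_data = AstraDBBatchFileData.cast(file_data=batch)
ids = [item.identifier for item in astra_file_data.batch_items]  # ["doc-1", "doc-2"]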