unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest might be problematic.

Files changed (44)
  1. test/integration/connectors/sql/test_postgres.py +3 -3
  2. test/integration/connectors/sql/test_singlestore.py +3 -3
  3. test/integration/connectors/sql/test_sqlite.py +3 -3
  4. test/integration/connectors/test_astradb.py +40 -0
  5. test/integration/connectors/test_kafka.py +2 -2
  6. test/integration/connectors/test_mongodb.py +4 -1
  7. test/integration/connectors/utils/validation/source.py +31 -11
  8. unstructured_ingest/__version__.py +1 -1
  9. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  10. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  12. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  13. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  15. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  16. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  17. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  18. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  19. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  21. unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
  22. unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
  23. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  24. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  25. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  26. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  27. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  28. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  29. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  30. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  31. unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
  32. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  33. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  34. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  36. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  37. unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
  38. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  39. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
  40. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
  41. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
test/integration/connectors/sql/test_postgres.py

@@ -28,7 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
  PostgresUploadStager,
  )

- SEED_DATA_ROWS = 20
+ SEED_DATA_ROWS = 10


  @pytest.fixture
@@ -69,7 +69,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
  )
  indexer = PostgresIndexer(
  connection_config=connection_config,
- index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+ index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
  )
  downloader = PostgresDownloader(
  connection_config=connection_config,
@@ -81,7 +81,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
  configs=SourceValidationConfigs(
  test_id="postgres",
  expected_num_files=SEED_DATA_ROWS,
- expected_number_indexed_file_data=4,
+ expected_number_indexed_file_data=2,
  validate_downloaded_files=True,
  ),
  )

test/integration/connectors/sql/test_singlestore.py

@@ -29,7 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
  SingleStoreUploadStager,
  )

- SEED_DATA_ROWS = 20
+ SEED_DATA_ROWS = 10


  @pytest.fixture
@@ -66,7 +66,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
  )
  indexer = SingleStoreIndexer(
  connection_config=connection_config,
- index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+ index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
  )
  downloader = SingleStoreDownloader(
  connection_config=connection_config,
@@ -80,7 +80,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
  configs=SourceValidationConfigs(
  test_id="singlestore",
  expected_num_files=SEED_DATA_ROWS,
- expected_number_indexed_file_data=4,
+ expected_number_indexed_file_data=2,
  validate_downloaded_files=True,
  ),
  )

test/integration/connectors/sql/test_sqlite.py

@@ -27,7 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
  SQLiteUploadStager,
  )

- SEED_DATA_ROWS = 20
+ SEED_DATA_ROWS = 10


  @pytest.fixture
@@ -57,7 +57,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
  connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
  indexer = SQLiteIndexer(
  connection_config=connection_config,
- index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=5),
+ index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
  )
  downloader = SQLiteDownloader(
  connection_config=connection_config,
@@ -69,7 +69,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
  configs=SourceValidationConfigs(
  test_id="sqlite",
  expected_num_files=SEED_DATA_ROWS,
- expected_number_indexed_file_data=4,
+ expected_number_indexed_file_data=2,
  validate_downloaded_files=True,
  ),
  )
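The seed-data, batch-size, and expected-count changes are consistent across all three SQL source tests: the indexers appear to emit one batched file-data record per batch of row ids, so the old values give ceil(20 / 5) = 4 and the new values give ceil(10 / 6) = 2. A minimal sketch of that assumed relationship (not part of the diff):

```python
import math

# Assumed relationship between the seeded rows, the SQL indexer batch size,
# and expected_number_indexed_file_data in the tests above: one batched
# file-data record per batch of row ids.
def expected_indexed_file_data(seed_rows: int, batch_size: int) -> int:
    return math.ceil(seed_rows / batch_size)

assert expected_indexed_file_data(20, 5) == 4  # previous test values
assert expected_indexed_file_data(10, 6) == 2  # updated test values
```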
test/integration/connectors/test_astradb.py

@@ -14,12 +14,18 @@ from test.integration.connectors.utils.validation.destination (
  StagerValidationConfigs,
  stager_validation,
  )
+ from test.integration.connectors.utils.validation.source import (
+ SourceValidationConfigs,
+ source_connector_validation,
+ )
  from test.integration.utils import requires_env
  from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
  from unstructured_ingest.v2.processes.connectors.astradb import (
  CONNECTOR_TYPE,
  AstraDBAccessConfig,
  AstraDBConnectionConfig,
+ AstraDBDownloader,
+ AstraDBDownloaderConfig,
  AstraDBIndexer,
  AstraDBIndexerConfig,
  AstraDBUploader,
@@ -110,6 +116,40 @@ def collection(upload_file: Path) -> Collection:
  astra_db.drop_collection(collection)


+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+ @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+ async def test_astra_search_source(
+ tmp_path: Path,
+ ):
+ env_data = get_env_data()
+ collection_name = "ingest_test_src"
+ connection_config = AstraDBConnectionConfig(
+ access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+ )
+ indexer = AstraDBIndexer(
+ index_config=AstraDBIndexerConfig(
+ collection_name=collection_name,
+ ),
+ connection_config=connection_config,
+ )
+ downloader = AstraDBDownloader(
+ connection_config=connection_config,
+ download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+ )
+
+ await source_connector_validation(
+ indexer=indexer,
+ downloader=downloader,
+ configs=SourceValidationConfigs(
+ test_id=CONNECTOR_TYPE,
+ expected_num_files=5,
+ expected_number_indexed_file_data=1,
+ validate_downloaded_files=True,
+ ),
+ )
+
+
  @pytest.mark.asyncio
  @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
  @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
test/integration/connectors/test_kafka.py

@@ -122,7 +122,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
  indexer=indexer,
  downloader=downloader,
  configs=SourceValidationConfigs(
- test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+ test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
  ),
  )

@@ -204,7 +204,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
  indexer=indexer,
  downloader=downloader,
  configs=SourceValidationConfigs(
- test_id="kafka",
+ test_id="kafka-cloud",
  exclude_fields_extend=["connector_type"],
  expected_num_files=expected_messages,
  validate_downloaded_files=True,
test/integration/connectors/test_mongodb.py

@@ -197,7 +197,10 @@ async def test_mongodb_source(temp_dir: Path):
  indexer=indexer,
  downloader=downloader,
  configs=SourceValidationConfigs(
- test_id=CONNECTOR_TYPE, expected_num_files=4, validate_downloaded_files=True
+ test_id=CONNECTOR_TYPE,
+ expected_num_files=4,
+ validate_downloaded_files=True,
+ expected_number_indexed_file_data=1,
  ),
  )
test/integration/connectors/utils/validation/source.py

@@ -1,14 +1,13 @@
  import json
  import os
  import shutil
- from dataclasses import replace
  from pathlib import Path
  from typing import Callable, Optional

  from deepdiff import DeepDiff
  from pydantic import Field

- from test.integration.connectors.utils.validation.utils import ValidationConfig, reset_dir
+ from test.integration.connectors.utils.validation.utils import ValidationConfig
  from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer


@@ -92,7 +91,7 @@ def check_contents(
  file_data_path = expected_output_dir / f"{file_data.identifier}.json"
  with file_data_path.open("r") as file:
  expected_file_data_contents = json.load(file)
- current_file_data_contents = file_data.to_dict()
+ current_file_data_contents = file_data.model_dump()
  expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
  current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
  diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,9 +159,11 @@ def update_fixtures(
  save_filedata: bool = True,
  ):
  # Rewrite the current file data
+ if not output_dir.exists():
+ output_dir.mkdir(parents=True)
  if save_filedata:
  file_data_output_path = output_dir / "file_data"
- reset_dir(dir_path=file_data_output_path)
+ shutil.rmtree(path=file_data_output_path, ignore_errors=True)
  print(
  f"Writing {len(all_file_data)} file data to "
  f"saved fixture location {file_data_output_path}"
@@ -171,7 +172,7 @@ def update_fixtures(
  for file_data in all_file_data:
  file_data_path = file_data_output_path / f"{file_data.identifier}.json"
  with file_data_path.open(mode="w") as f:
- json.dump(file_data.to_dict(), f, indent=2)
+ json.dump(file_data.model_dump(), f, indent=2)

  # Record file structure of download directory
  download_files = get_files(dir_path=download_dir)
@@ -183,7 +184,7 @@ def update_fixtures(
  # If applicable, save raw downloads
  if save_downloads:
  raw_download_output_path = output_dir / "downloads"
- reset_dir(raw_download_output_path)
+ shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
  print(
  f"Writing {len(download_files)} downloaded files to "
  f"saved fixture location {raw_download_output_path}"
@@ -213,7 +214,10 @@ def run_all_validations(
  if configs.validate_file_data:
  run_expected_results_validation(
  expected_output_dir=test_output_dir / "file_data",
- all_file_data=postdownload_file_data,
+ all_file_data=get_all_file_data(
+ all_predownload_file_data=predownload_file_data,
+ all_postdownload_file_data=postdownload_file_data,
+ ),
  configs=configs,
  )
  download_files = get_files(dir_path=download_dir)
@@ -229,6 +233,19 @@ def run_all_validations(
  )


+ def get_all_file_data(
+ all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+ ) -> list[FileData]:
+ all_file_data = all_postdownload_file_data
+ indexed_file_data = [
+ fd
+ for fd in all_predownload_file_data
+ if fd.identifier not in [f.identifier for f in all_file_data]
+ ]
+ all_file_data += indexed_file_data
+ return all_file_data
+
+
  async def source_connector_validation(
  indexer: Indexer,
  downloader: Downloader,
@@ -246,7 +263,7 @@ async def source_connector_validation(
  test_output_dir = configs.test_output_dir()
  for file_data in indexer.run():
  assert file_data
- predownload_file_data = replace(file_data)
+ predownload_file_data = file_data.model_copy(deep=True)
  all_predownload_file_data.append(predownload_file_data)
  if downloader.is_async():
  resp = await downloader.run_async(file_data=file_data)
@@ -254,10 +271,10 @@ async def source_connector_validation(
  resp = downloader.run(file_data=file_data)
  if isinstance(resp, list):
  for r in resp:
- postdownload_file_data = replace(r["file_data"])
+ postdownload_file_data = r["file_data"].model_copy(deep=True)
  all_postdownload_file_data.append(postdownload_file_data)
  else:
- postdownload_file_data = replace(resp["file_data"])
+ postdownload_file_data = resp["file_data"].model_copy(deep=True)
  all_postdownload_file_data.append(postdownload_file_data)
  if not overwrite_fixtures:
  print("Running validation")
@@ -273,7 +290,10 @@ async def source_connector_validation(
  update_fixtures(
  output_dir=test_output_dir,
  download_dir=download_dir,
- all_file_data=all_postdownload_file_data,
+ all_file_data=get_all_file_data(
+ all_predownload_file_data=all_predownload_file_data,
+ all_postdownload_file_data=all_postdownload_file_data,
+ ),
  save_downloads=configs.validate_downloaded_files,
  save_filedata=configs.validate_file_data,
  )
unstructured_ingest/__version__.py

@@ -1 +1 @@
- __version__ = "0.3.9" # pragma: no cover
+ __version__ = "0.3.10" # pragma: no cover
unstructured_ingest/v2/interfaces/__init__.py

@@ -1,6 +1,6 @@
  from .connector import AccessConfig, BaseConnector, ConnectionConfig
  from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
- from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+ from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
  from .indexer import Indexer, IndexerConfig
  from .process import BaseProcess
  from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
  "ConnectionConfig",
  "BaseConnector",
  "FileDataSourceMetadata",
+ "BatchFileData",
+ "BatchItem",
  ]
unstructured_ingest/v2/interfaces/file_data.py

@@ -1,13 +1,14 @@
  import json
- from dataclasses import dataclass, field
  from pathlib import Path
- from typing import Any, Literal, Optional
+ from typing import Any, Optional
+ from uuid import NAMESPACE_DNS, uuid5

- from dataclasses_json import DataClassJsonMixin
+ from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator

+ from unstructured_ingest.v2.logger import logger

- @dataclass
- class SourceIdentifiers:
+
+ class SourceIdentifiers(BaseModel):
  filename: str
  fullpath: str
  rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
  return self.rel_path or self.fullpath


- @dataclass
- class FileDataSourceMetadata(DataClassJsonMixin):
+ class FileDataSourceMetadata(BaseModel):
  url: Optional[str] = None
  version: Optional[str] = None
  record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
  filesize_bytes: Optional[int] = None


- @dataclass
- class FileData(DataClassJsonMixin):
+ class FileData(BaseModel):
  identifier: str
  connector_type: str
  source_identifiers: Optional[SourceIdentifiers] = None
- doc_type: Literal["file", "batch"] = field(default="file")
- metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
- additional_metadata: dict[str, Any] = field(default_factory=dict)
+ metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+ additional_metadata: dict[str, Any] = Field(default_factory=dict)
  reprocess: bool = False
  local_download_path: Optional[str] = None
  display_name: Optional[str] = None
@@ -52,11 +50,57 @@ class FileData(DataClassJsonMixin):
  raise ValueError(f"file path not valid: {path}")
  with open(str(path.resolve()), "rb") as f:
  file_data_dict = json.load(f)
- file_data = FileData.from_dict(file_data_dict)
+ file_data = cls.model_validate(file_data_dict)
  return file_data

+ @classmethod
+ def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+ file_data_dict = file_data.model_dump()
+ return cls.model_validate(file_data_dict, **kwargs)
+
  def to_file(self, path: str) -> None:
  path = Path(path).resolve()
  path.parent.mkdir(parents=True, exist_ok=True)
  with open(str(path.resolve()), "w") as f:
- json.dump(self.to_dict(), f, indent=2)
+ json.dump(self.model_dump(), f, indent=2)
+
+
+ class BatchItem(BaseModel):
+ identifier: str
+ version: Optional[str] = None
+
+
+ class BatchFileData(FileData):
+ identifier: str = Field(init=False)
+ batch_items: list[BatchItem]
+
+ @field_validator("batch_items")
+ @classmethod
+ def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+ if not v:
+ raise ValueError("batch items cannot be empty")
+ all_identifiers = [item.identifier for item in v]
+ if len(all_identifiers) != len(set(all_identifiers)):
+ raise ValueError(f"duplicate identifiers: {all_identifiers}")
+ sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+ return sorted_batch_items
+
+ @model_validator(mode="before")
+ @classmethod
+ def populate_identifier(cls, data: Any) -> Any:
+ if isinstance(data, dict) and "identifier" not in data:
+ batch_items = data["batch_items"]
+ identifier_data = json.dumps(
+ {item.identifier: item.version for item in batch_items}, sort_keys=True
+ )
+ data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+ return data
+
+
+ def file_data_from_file(path: str) -> FileData:
+ try:
+ return BatchFileData.from_file(path=path)
+ except ValidationError:
+ logger.debug(f"{path} not valid for batch file data")
+
+ return FileData.from_file(path=path)
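A minimal sketch (not part of the diff) of how the new pydantic-based models fit together, based on the exports added to unstructured_ingest.v2.interfaces above; the connector type and file path are illustrative only:

```python
from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file

# The batch identifier is derived from the item identifiers/versions via uuid5,
# so it does not need to be supplied explicitly.
batch = BatchFileData(
    connector_type="postgres",  # illustrative value
    batch_items=[BatchItem(identifier="row-1"), BatchItem(identifier="row-2", version="2")],
)

# Round-trip through disk: file_data_from_file() tries BatchFileData first and
# falls back to plain FileData, which is how the pipeline steps below reload
# whatever kind of record the indexer wrote out.
batch.to_file("/tmp/example_file_data.json")  # illustrative path
restored = file_data_from_file("/tmp/example_file_data.json")
assert isinstance(restored, BatchFileData)
assert restored.identifier == batch.identifier
```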
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.chunker import Chunker
@@ -51,7 +52,7 @@ class ChunkStep(PipelineStep):
  self, fn: Callable, path: str, file_data_path: str, **kwargs
  ) -> ChunkStepResponse:
  path = Path(path)
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=path)
  if not self.should_chunk(filepath=output_filepath, file_data=file_data):
  logger.debug(f"skipping chunking, output already exists: {output_filepath}")
unstructured_ingest/v2/pipeline/steps/download.py

@@ -8,6 +8,7 @@ from typing import Callable, Optional, TypedDict, TypeVar

  from unstructured_ingest.v2.interfaces import FileData, download_responses
  from unstructured_ingest.v2.interfaces.downloader import Downloader
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.utils import serialize_base_model_json
@@ -87,12 +88,12 @@ class DownloadStep(PipelineStep):
  f"match size of local file: {file_size_bytes}, updating"
  )
  file_data.metadata.filesize_bytes = file_size_bytes
- logger.debug(f"updating file data with new content: {file_data.to_dict()}")
+ logger.debug(f"updating file data with new content: {file_data.model_dump()}")
  with file_data_path.open("w") as file:
- json.dump(file_data.to_dict(), file, indent=2)
+ json.dump(file_data.model_dump(), file, indent=2)

  async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  download_path = self.process.get_download_path(file_data=file_data)
  if not self.should_download(file_data=file_data, file_data_path=file_data_path):
  logger.debug(f"skipping download, file already exists locally: {download_path}")
@@ -172,7 +173,7 @@ class DownloadStep(PipelineStep):
  filepath = (self.cache_dir / filename).resolve()
  filepath.parent.mkdir(parents=True, exist_ok=True)
  with open(str(filepath), "w") as f:
- json.dump(file_data.to_dict(), f, indent=2)
+ json.dump(file_data.model_dump(), f, indent=2)
  return str(filepath)

  def get_hash(self, extras: Optional[list[str]]) -> str:
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.embedder import Embedder
@@ -49,7 +50,7 @@ class EmbedStep(PipelineStep):

  async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
  path = Path(path)
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=path)
  if not self.should_embed(filepath=output_filepath, file_data=file_data):
  logger.debug(f"skipping embedding, output already exists: {output_filepath}")
unstructured_ingest/v2/pipeline/steps/filter.py

@@ -2,7 +2,7 @@ import asyncio
  from dataclasses import dataclass
  from typing import Callable, Optional

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.filter import Filterer
@@ -20,7 +20,7 @@ class FilterStep(PipelineStep):
  logger.info(f"created {self.identifier} with configs: {config}")

  async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  fn_kwargs = {"file_data": file_data}
  if not asyncio.iscoroutinefunction(fn):
  resp = fn(**fn_kwargs)
unstructured_ingest/v2/pipeline/steps/index.py

@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
  @instrument(span_name=STEP_ID)
  def run(self) -> Generator[str, None, None]:
  for file_data in self.process.run():
- logger.debug(f"generated file data: {file_data.to_dict()}")
+ logger.debug(f"generated file data: {file_data.model_dump()}")
  try:
  record_hash = self.get_hash(extras=[file_data.identifier])
  filename = f"{record_hash}.json"
  filepath = (self.cache_dir / filename).resolve()
  filepath.parent.mkdir(parents=True, exist_ok=True)
  with open(str(filepath), "w") as f:
- json.dump(file_data.to_dict(), f, indent=2)
+ json.dump(file_data.model_dump(), f, indent=2)
  yield str(filepath)
  except Exception as e:
  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):

  async def run_async(self) -> AsyncGenerator[str, None]:
  async for file_data in self.process.run_async():
- logger.debug(f"generated file data: {file_data.to_dict()}")
+ logger.debug(f"generated file data: {file_data.model_dump()}")
  try:
  record_hash = self.get_hash(extras=[file_data.identifier])
  filename = f"{record_hash}.json"
  filepath = (self.cache_dir / filename).resolve()
  filepath.parent.mkdir(parents=True, exist_ok=True)
  with open(str(filepath), "w") as f:
- json.dump(file_data.to_dict(), f, indent=2)
+ json.dump(file_data.model_dump(), f, indent=2)
  yield str(filepath)
  except Exception as e:
  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
unstructured_ingest/v2/pipeline/steps/partition.py

@@ -6,6 +6,7 @@ from pathlib import Path
  from typing import Callable, Optional, TypedDict

  from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.partitioner import Partitioner
@@ -51,12 +52,12 @@ class PartitionStep(PipelineStep):
  self, fn: Callable, path: str, file_data_path: str
  ) -> Optional[PartitionStepResponse]:
  path = Path(path)
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=Path(file_data_path))
  if not self.should_partition(filepath=output_filepath, file_data=file_data):
  logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
  return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
- fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
+ fn_kwargs = {"filename": path, "metadata": file_data.metadata.model_dump()}
  if not asyncio.iscoroutinefunction(fn):
  partitioned_content = fn(**fn_kwargs)
  elif semaphore := self.context.semaphore:
unstructured_ingest/v2/pipeline/steps/stage.py

@@ -4,7 +4,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, Optional, TypedDict

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
@@ -43,7 +43,7 @@ class UploadStageStep(PipelineStep):
  output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
  fn_kwargs = {
  "elements_filepath": path,
- "file_data": FileData.from_file(path=file_data_path),
+ "file_data": file_data_from_file(path=file_data_path),
  "output_dir": self.cache_dir,
  "output_filename": output_filename,
  }
unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -3,7 +3,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, TypedDict

- from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
  from unstructured_ingest.v2.processes.uncompress import Uncompressor
@@ -28,7 +28,7 @@ class UncompressStep(PipelineStep):
  async def _run_async(
  self, fn: Callable, path: str, file_data_path: str
  ) -> list[UncompressStepResponse]:
- file_data = FileData.from_file(path=file_data_path)
+ file_data = file_data_from_file(path=file_data_path)
  fn_kwargs = {"file_data": file_data}
  if not asyncio.iscoroutinefunction(fn):
  new_file_data = fn(**fn_kwargs)
unstructured_ingest/v2/pipeline/steps/upload.py

@@ -3,7 +3,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, Optional, TypedDict

- from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
  from unstructured_ingest.v2.interfaces.uploader import UploadContent
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
@@ -41,14 +41,14 @@ class UploadStep(BatchPipelineStep):
  @instrument(span_name=STEP_ID)
  def _run_batch(self, contents: list[UploadStepContent]) -> None:
  upload_contents = [
- UploadContent(path=Path(c["path"]), file_data=FileData.from_file(c["file_data_path"]))
+ UploadContent(path=Path(c["path"]), file_data=file_data_from_file(c["file_data_path"]))
  for c in contents
  ]
  self.process.run_batch(contents=upload_contents)

  async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
  fn = fn or self.process.run_async
- fn_kwargs = {"path": Path(path), "file_data": FileData.from_file(path=file_data_path)}
+ fn_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
  if not asyncio.iscoroutinefunction(fn):
  fn(**fn_kwargs)
  elif semaphore := self.context.semaphore:
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -40,6 +40,8 @@ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
  from .milvus import milvus_destination_entry
  from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
  from .mongodb import mongodb_destination_entry, mongodb_source_entry
+ from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
+ from .neo4j import neo4j_destination_entry
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
  from .onedrive import onedrive_destination_entry, onedrive_source_entry
  from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -74,6 +76,7 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
  add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
  add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)

+ add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)

  add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)