unstructured-ingest 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic.

Files changed (64)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +102 -91
  10. test/integration/connectors/sql/test_singlestore.py +111 -99
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +86 -75
  13. test/integration/connectors/test_astradb.py +22 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +4 -4
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +3 -3
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  35. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  36. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  37. unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
  38. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  39. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  40. unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
  41. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  42. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  43. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  44. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  45. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
  46. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  47. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  48. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  49. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  50. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  51. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  52. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  53. unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
  54. unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
  55. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -65
  56. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  57. unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
  58. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  59. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +21 -17
  60. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
  61. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
  62. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
  63. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
  64. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/{validation.py → validation/source.py}
@@ -1,83 +1,28 @@
- import filecmp
  import json
  import os
  import shutil
- from dataclasses import dataclass, field, replace
+ from dataclasses import replace
  from pathlib import Path
  from typing import Callable, Optional

- import pandas as pd
- from bs4 import BeautifulSoup
  from deepdiff import DeepDiff
+ from pydantic import Field

- from test.integration.connectors.utils.constants import expected_results_path
+ from test.integration.connectors.utils.validation.utils import ValidationConfig, reset_dir
  from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer


- def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-     expected_df = pd.read_csv(expected_filepath)
-     current_df = pd.read_csv(current_filepath)
-     if expected_df.equals(current_df):
-         return True
-     # Print diff
-     diff = expected_df.merge(current_df, indicator=True, how="left").loc[
-         lambda x: x["_merge"] != "both"
-     ]
-     print("diff between expected and current df:")
-     print(diff)
-     return False
-
-
- def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-     with expected_filepath.open() as expected_f:
-         expected_soup = BeautifulSoup(expected_f, "html.parser")
-     with current_filepath.open() as current_f:
-         current_soup = BeautifulSoup(current_f, "html.parser")
-     return expected_soup.text == current_soup.text
-
-
- def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-     with expected_filepath.open() as expected_f:
-         expected_text_lines = expected_f.readlines()
-     with current_filepath.open() as current_f:
-         current_text_lines = current_f.readlines()
-     if len(expected_text_lines) != len(current_text_lines):
-         print(
-             f"Lines in expected text file ({len(expected_text_lines)}) "
-             f"don't match current text file ({len(current_text_lines)})"
-         )
-         return False
-     expected_text = "\n".join(expected_text_lines)
-     current_text = "\n".join(current_text_lines)
-     if expected_text == current_text:
-         return True
-     print("txt content don't match:")
-     print(f"expected: {expected_text}")
-     print(f"current: {current_text}")
-     return False
-
-
- file_type_equality_check = {
-     ".json": json_equality_check,
-     ".html": html_equality_check,
-     ".txt": txt_equality_check,
- }
-
-
- @dataclass
- class ValidationConfigs:
-     test_id: str
+ class SourceValidationConfigs(ValidationConfig):
      expected_number_indexed_file_data: Optional[int] = None
      expected_num_files: Optional[int] = None
      predownload_file_data_check: Optional[Callable[[FileData], None]] = None
      postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-     exclude_fields: list[str] = field(
+     exclude_fields: list[str] = Field(
          default_factory=lambda: ["local_download_path", "metadata.date_processed"]
      )
-     exclude_fields_extend: list[str] = field(default_factory=list)
+     exclude_fields_extend: list[str] = Field(default_factory=list)
      validate_downloaded_files: bool = False
      validate_file_data: bool = True
-     downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None

      def get_exclude_fields(self) -> list[str]:
          exclude_fields = self.exclude_fields
@@ -97,9 +42,6 @@ class ValidationConfigs:
          downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
          assert len(downloaded_files) == expected_num_files

-     def test_output_dir(self) -> Path:
-         return expected_results_path / self.test_id
-
      def omit_ignored_fields(self, data: dict) -> dict:
          exclude_fields = self.get_exclude_fields()
          # Ignore fields that dynamically change every time the tests run
@@ -143,7 +85,7 @@ def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):


  def check_contents(
-     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+     expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
  ):
      found_diff = False
      for file_data in all_file_data:
@@ -160,27 +102,10 @@ def check_contents(
      assert not found_diff, f"Diffs found between files: {found_diff}"


- def detect_diff(
-     configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
- ) -> bool:
-     if expected_filepath.suffix != current_filepath.suffix:
-         return True
-     if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-         return not downloaded_file_equality_check(expected_filepath, current_filepath)
-     current_suffix = expected_filepath.suffix
-     if current_suffix in file_type_equality_check:
-         equality_check_callable = file_type_equality_check[current_suffix]
-         return not equality_check_callable(
-             expected_filepath=expected_filepath, current_filepath=current_filepath
-         )
-     # Fallback is using filecmp.cmp to compare the files
-     return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-
-
  def check_raw_file_contents(
      expected_output_dir: Path,
      current_output_dir: Path,
-     configs: ValidationConfigs,
+     configs: SourceValidationConfigs,
  ):
      current_files = get_files(dir_path=current_output_dir)
      found_diff = False
@@ -188,7 +113,7 @@ def check_raw_file_contents(
      for current_file in current_files:
          current_file_path = current_output_dir / current_file
          expected_file_path = expected_output_dir / current_file
-         if detect_diff(configs, expected_file_path, current_file_path):
+         if configs.detect_diff(expected_file_path, current_file_path):
              found_diff = True
              files.append(str(expected_file_path))
              print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -196,7 +121,7 @@ def check_raw_file_contents(


  def run_expected_results_validation(
-     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+     expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
  ):
      check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
      check_contents(
@@ -207,7 +132,7 @@ def run_expected_results_validation(
  def run_expected_download_files_validation(
      expected_output_dir: Path,
      current_download_dir: Path,
-     configs: ValidationConfigs,
+     configs: SourceValidationConfigs,
  ):
      check_files_in_paths(
          expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
@@ -234,12 +159,10 @@ def update_fixtures(
      save_downloads: bool = False,
      save_filedata: bool = True,
  ):
-     # Delete current files
-     shutil.rmtree(path=output_dir, ignore_errors=True)
-     output_dir.mkdir(parents=True)
      # Rewrite the current file data
      if save_filedata:
          file_data_output_path = output_dir / "file_data"
+         reset_dir(dir_path=file_data_output_path)
          print(
              f"Writing {len(all_file_data)} file data to "
              f"saved fixture location {file_data_output_path}"
@@ -260,6 +183,7 @@ def update_fixtures(
      # If applicable, save raw downloads
      if save_downloads:
          raw_download_output_path = output_dir / "downloads"
+         reset_dir(raw_download_output_path)
          print(
              f"Writing {len(download_files)} downloaded files to "
              f"saved fixture location {raw_download_output_path}"
@@ -268,7 +192,7 @@ def update_fixtures(


  def run_all_validations(
-     configs: ValidationConfigs,
+     configs: SourceValidationConfigs,
      predownload_file_data: list[FileData],
      postdownload_file_data: list[FileData],
      download_dir: Path,
@@ -308,7 +232,7 @@ def run_all_validations(
  async def source_connector_validation(
      indexer: Indexer,
      downloader: Downloader,
-     configs: ValidationConfigs,
+     configs: SourceValidationConfigs,
      overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
  ) -> None:
      # Run common validations on the process of running a source connector, supporting dynamic

@@ -0,0 +1,36 @@
1
+ import filecmp
2
+ import shutil
3
+ from pathlib import Path
4
+ from typing import Callable, Optional
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from test.integration.connectors.utils.constants import expected_results_path
9
+ from test.integration.connectors.utils.validation.equality import file_type_equality_check
10
+
11
+
12
+ class ValidationConfig(BaseModel):
13
+ test_id: str
14
+ file_equality_check: Optional[Callable[[Path, Path], bool]] = None
15
+
16
+ def test_output_dir(self) -> Path:
17
+ return expected_results_path / self.test_id
18
+
19
+ def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
20
+ if expected_filepath.suffix != current_filepath.suffix:
21
+ return True
22
+ if file_equality_check := self.file_equality_check:
23
+ return not file_equality_check(expected_filepath, current_filepath)
24
+ current_suffix = expected_filepath.suffix
25
+ if current_suffix in file_type_equality_check:
26
+ equality_check_callable = file_type_equality_check[current_suffix]
27
+ return not equality_check_callable(
28
+ expected_filepath=expected_filepath, current_filepath=current_filepath
29
+ )
30
+ # Fallback is using filecmp.cmp to compare the files
31
+ return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
32
+
33
+
34
+ def reset_dir(dir_path: Path) -> None:
35
+ shutil.rmtree(path=dir_path, ignore_errors=True)
36
+ dir_path.mkdir(parents=True)
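
The validation helpers are now split: the shared ValidationConfig above carries the test_id, expected-results directory lookup, and file-diff logic, while SourceValidationConfigs (in validation/source.py) adds the source-connector-specific knobs. A minimal usage sketch for an integration test, assuming pytest-asyncio and connector-specific indexer/downloader fixtures (both hypothetical here):

    import pytest

    from test.integration.connectors.utils.validation.source import (
        SourceValidationConfigs,
        source_connector_validation,
    )


    @pytest.mark.asyncio
    async def test_example_source(indexer, downloader):  # hypothetical fixtures
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=SourceValidationConfigs(
                test_id="example",             # resolves the expected-results fixture dir
                expected_num_files=3,          # illustrative count, not from this diff
                validate_downloaded_files=True,
            ),
        )
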
unstructured_ingest/__version__.py
@@ -1 +1 @@
- __version__ = "0.3.7" # pragma: no cover
+ __version__ = "0.3.9" # pragma: no cover

unstructured_ingest/utils/chunking.py
@@ -1,4 +1,7 @@
+ import base64
  import hashlib
+ import json
+ import zlib
  from itertools import groupby


@@ -43,3 +46,11 @@ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
              e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]

      return elements
+
+
+ def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
+     decoded_b64_bytes = base64.b64decode(raw_s)
+     elements_json_bytes = zlib.decompress(decoded_b64_bytes)
+     elements_json_str = elements_json_bytes.decode("utf-8")
+     element_dicts = json.loads(elements_json_str)
+     return element_dicts
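
The new elements_from_base64_gzipped_json helper decodes base64, inflates with zlib, and parses JSON. A round-trip sketch; the encoding side shown here is an assumption for illustration and is not part of this diff:

    import base64
    import json
    import zlib

    from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json

    # Encode the way the helper expects to decode: JSON -> zlib -> base64.
    elements = [{"type": "NarrativeText", "text": "hello"}]
    raw_s = base64.b64encode(zlib.compress(json.dumps(elements).encode("utf-8"))).decode("utf-8")

    assert elements_from_base64_gzipped_json(raw_s) == elements
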
unstructured_ingest/utils/data_prep.py
@@ -1,8 +1,10 @@
  import itertools
  import json
  from datetime import datetime
+ from pathlib import Path
  from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast

+ import ndjson
  import pandas as pd

  DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -131,3 +133,37 @@ def validate_date_args(date: Optional[str] = None) -> bool:
          f"The argument {date} does not satisfy the format:"
          f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
      )
+
+
+ def get_data(path: Path) -> list[dict]:
+     with path.open() as f:
+         if path.suffix == ".json":
+             return json.load(f)
+         elif path.suffix == ".ndjson":
+             return ndjson.load(f)
+         elif path.suffix == ".csv":
+             df = pd.read_csv(path)
+             return df.to_dict(orient="records")
+         elif path.suffix == ".parquet":
+             df = pd.read_parquet(path)
+             return df.to_dict(orient="records")
+         else:
+             raise ValueError(f"Unsupported file type: {path}")
+
+
+ def get_data_df(path: Path) -> pd.DataFrame:
+     with path.open() as f:
+         if path.suffix == ".json":
+             data = json.load(f)
+             return pd.DataFrame(data=data)
+         elif path.suffix == ".ndjson":
+             data = ndjson.load(f)
+             return pd.DataFrame(data=data)
+         elif path.suffix == ".csv":
+             df = pd.read_csv(path)
+             return df
+         elif path.suffix == ".parquet":
+             df = pd.read_parquet(path)
+             return df
+         else:
+             raise ValueError(f"Unsupported file type: {path}")
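
get_data and get_data_df dispatch on the file suffix (.json, .ndjson, .csv, .parquet) and return parsed records or a DataFrame. A small sketch, assuming a throwaway ndjson file written with the ndjson package the module already imports; the file path is hypothetical:

    from pathlib import Path

    import ndjson

    from unstructured_ingest.utils.data_prep import get_data, get_data_df

    path = Path("/tmp/elements.ndjson")  # hypothetical staging file
    with path.open("w") as f:
        ndjson.dump([{"text": "hello"}, {"text": "world"}], f)

    records = get_data(path=path)  # list[dict], chosen by the .ndjson suffix
    df = get_data_df(path=path)    # pandas DataFrame with the same rows
    print(len(records), df.shape)
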
unstructured_ingest/v2/interfaces/upload_stager.py
@@ -1,8 +1,10 @@
- from abc import ABC, abstractmethod
+ import json
+ from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, TypeVar

+ import ndjson
  from pydantic import BaseModel

  from unstructured_ingest.v2.interfaces.file_data import FileData
@@ -20,16 +22,78 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
  class UploadStager(BaseProcess, ABC):
      upload_stager_config: UploadStagerConfigT

-     @abstractmethod
+     def write_output(self, output_path: Path, data: list[dict]) -> None:
+         if output_path.suffix == ".json":
+             with output_path.open("w") as f:
+                 json.dump(data, f, indent=2)
+         elif output_path.suffix == ".ndjson":
+             with output_path.open("w") as f:
+                 ndjson.dump(data, f)
+         else:
+             raise ValueError(f"Unsupported output format: {output_path}")
+
+     def get_data(self, elements_filepath: Path) -> list[dict]:
+         if elements_filepath.suffix == ".json":
+             with elements_filepath.open() as f:
+                 return json.load(f)
+         elif elements_filepath.suffix == ".ndjson":
+             with elements_filepath.open() as f:
+                 return ndjson.load(f)
+         else:
+             raise ValueError(f"Unsupported input format: {elements_filepath}")
+
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+         return element_dict
+
+     def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
+         output_path = Path(output_filename)
+         output_filename = f"{Path(output_filename).stem}{output_path.suffix}"
+         output_path = Path(output_dir) / Path(f"{output_filename}")
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         return output_path
+
+     def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+         with input_file.open() as in_f:
+             reader = ndjson.reader(in_f)
+             with output_file.open("w") as out_f:
+                 writer = ndjson.writer(out_f)
+                 for element in reader:
+                     conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
+                     writer.writerow(row=conformed_element)
+                     writer.f.flush()
+
+     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+         with input_file.open() as in_f:
+             elements_contents = json.load(in_f)
+
+         conformed_elements = [
+             self.conform_dict(element_dict=element, file_data=file_data)
+             for element in elements_contents
+         ]
+
+         with open(output_file, "w") as out_f:
+             json.dump(conformed_elements, out_f, indent=2)
+
      def run(
          self,
          elements_filepath: Path,
          file_data: FileData,
          output_dir: Path,
          output_filename: str,
-         **kwargs: Any
+         **kwargs: Any,
      ) -> Path:
-         pass
+         output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+         if elements_filepath.suffix == ".ndjson":
+             self.stream_update(
+                 input_file=elements_filepath, output_file=output_file, file_data=file_data
+             )
+         elif elements_filepath.suffix == ".json":
+             self.process_whole(
+                 input_file=elements_filepath, output_file=output_file, file_data=file_data
+             )
+         else:
+             raise ValueError(f"Unsupported file extension: {elements_filepath}")
+         return output_file

      async def run_async(
          self,
@@ -37,12 +101,12 @@ class UploadStager(BaseProcess, ABC):
          file_data: FileData,
          output_dir: Path,
          output_filename: str,
-         **kwargs: Any
+         **kwargs: Any,
      ) -> Path:
          return self.run(
              elements_filepath=elements_filepath,
              output_dir=output_dir,
              output_filename=output_filename,
              file_data=file_data,
-             **kwargs
+             **kwargs,
          )
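
With run()/run_async() now implemented on the base class (streaming .ndjson via stream_update, loading .json via process_whole), a concrete stager typically only overrides conform_dict. A hedged sketch of such a subclass, using the import paths visible in this diff; the class itself is hypothetical:

    from dataclasses import dataclass, field

    from unstructured_ingest.v2.interfaces.file_data import FileData
    from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig


    @dataclass
    class ExampleUploadStager(UploadStager):  # illustrative only
        upload_stager_config: UploadStagerConfig = field(default_factory=UploadStagerConfig)

        def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
            # Per-element reshaping; reading, streaming, and writing the staged
            # output file are handled by the base class run()/run_async().
            conformed = element_dict.copy()
            conformed["record_id"] = file_data.identifier
            return conformed
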
unstructured_ingest/v2/interfaces/uploader.py
@@ -5,6 +5,7 @@ from typing import Any, TypeVar

  from pydantic import BaseModel

+ from unstructured_ingest.utils.data_prep import get_data
  from unstructured_ingest.v2.interfaces.connector import BaseConnector
  from unstructured_ingest.v2.interfaces.file_data import FileData
  from unstructured_ingest.v2.interfaces.process import BaseProcess
@@ -38,7 +39,15 @@ class Uploader(BaseProcess, BaseConnector, ABC):
          raise NotImplementedError()

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         raise NotImplementedError()
+         data = get_data(path=path)
+         self.run_data(data=data, file_data=file_data, **kwargs)

      async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         return self.run(contents=[UploadContent(path=path, file_data=file_data)], **kwargs)
+         data = get_data(path=path)
+         await self.run_data_async(data=data, file_data=file_data, **kwargs)
+
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         raise NotImplementedError()
+
+     async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         return self.run_data(data=data, file_data=file_data, **kwargs)
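
Uploader.run()/run_async() now load the staged file with get_data and delegate to run_data()/run_data_async(), so destination connectors implement the data-level method. A rough sketch of the new contract; real subclasses also declare their connection and upload config fields, which are omitted here, and the class name is made up:

    from dataclasses import dataclass
    from typing import Any

    from unstructured_ingest.v2.interfaces.file_data import FileData
    from unstructured_ingest.v2.interfaces.uploader import Uploader


    @dataclass
    class PrintUploader(Uploader):  # illustrative only, not a shipped connector
        def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
            # `data` is already parsed from .json/.ndjson/.csv/.parquet by the base class.
            print(f"would upload {len(data)} records for {file_data.identifier}")
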
unstructured_ingest/v2/pipeline/steps/stage.py
@@ -39,11 +39,13 @@ class UploadStageStep(PipelineStep):
          self, fn: Callable, path: str, file_data_path: str
      ) -> UploadStageStepResponse:
          path = Path(path)
+         # Maintain extension
+         output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
          fn_kwargs = {
              "elements_filepath": path,
              "file_data": FileData.from_file(path=file_data_path),
              "output_dir": self.cache_dir,
-             "output_filename": self.get_hash(extras=[path.name]),
+             "output_filename": output_filename,
          }
          if not asyncio.iscoroutinefunction(fn):
              staged_output_path = fn(**fn_kwargs)

unstructured_ingest/v2/processes/connectors/astradb.py
@@ -1,7 +1,6 @@
  import copy
  import csv
  import hashlib
- import json
  import sys
  from dataclasses import dataclass, field
  from pathlib import Path
@@ -17,7 +16,7 @@ from unstructured_ingest.error import (
      SourceConnectionError,
      SourceConnectionNetworkError,
  )
- from unstructured_ingest.utils.data_prep import batch_generator
+ from unstructured_ingest.utils.data_prep import batch_generator, get_data
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
@@ -325,29 +324,6 @@ class AstraDBUploadStager(UploadStager):
              "metadata": element_dict,
          }

-     def run(
-         self,
-         elements_filepath: Path,
-         file_data: FileData,
-         output_dir: Path,
-         output_filename: str,
-         **kwargs: Any,
-     ) -> Path:
-         with open(elements_filepath) as elements_file:
-             elements_contents = json.load(elements_file)
-         conformed_elements = []
-         for element in elements_contents:
-             conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
-         output_filename_path = Path(output_filename)
-         if output_filename_path.suffix == ".json":
-             output_path = Path(output_dir) / output_filename_path
-         else:
-             output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-         with open(output_path, "w") as output_file:
-             json.dump(conformed_elements, output_file, indent=2)
-         return output_path
-

  @dataclass
  class AstraDBUploader(Uploader):
@@ -386,11 +362,9 @@ class AstraDBUploader(Uploader):
              f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
          )

-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         with path.open("r") as file:
-             elements_dict = json.load(file)
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
          logger.info(
-             f"writing {len(elements_dict)} objects to destination "
+             f"writing {len(data)} objects to destination "
              f"collection {self.upload_config.collection_name}"
          )

@@ -399,9 +373,13 @@ class AstraDBUploader(Uploader):

          self.delete_by_record_id(collection=collection, file_data=file_data)

-         for chunk in batch_generator(elements_dict, astra_db_batch_size):
+         for chunk in batch_generator(data, astra_db_batch_size):
              collection.insert_many(chunk)

+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         data = get_data(path=path)
+         self.run_data(data=data, file_data=file_data, **kwargs)
+

  astra_db_source_entry = SourceRegistryEntry(
      indexer=AstraDBIndexer,
unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -1,7 +1,7 @@
  import json
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
- from pathlib import Path
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Any, Generator

  from pydantic import Field, Secret

@@ -49,29 +49,33 @@ class AzureAISearchConnectionConfig(ConnectionConfig):
      access_config: Secret[AzureAISearchAccessConfig]

      @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-     def get_search_client(self) -> "SearchClient":
+     @contextmanager
+     def get_search_client(self) -> Generator["SearchClient", None, None]:
          from azure.core.credentials import AzureKeyCredential
          from azure.search.documents import SearchClient

-         return SearchClient(
+         with SearchClient(
              endpoint=self.endpoint,
              index_name=self.index,
              credential=AzureKeyCredential(
                  self.access_config.get_secret_value().azure_ai_search_key
              ),
-         )
+         ) as client:
+             yield client

      @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-     def get_search_index_client(self) -> "SearchIndexClient":
+     @contextmanager
+     def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
          from azure.core.credentials import AzureKeyCredential
          from azure.search.documents.indexes import SearchIndexClient

-         return SearchIndexClient(
+         with SearchIndexClient(
              endpoint=self.endpoint,
              credential=AzureKeyCredential(
                  self.access_config.get_secret_value().azure_ai_search_key
              ),
-         )
+         ) as search_index_client:
+             yield search_index_client


  class AzureAISearchUploadStagerConfig(UploadStagerConfig):
@@ -92,14 +96,13 @@ class AzureAISearchUploadStager(UploadStager):
          default_factory=lambda: AzureAISearchUploadStagerConfig()
      )

-     @staticmethod
-     def conform_dict(data: dict, file_data: FileData) -> dict:
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
          """
          updates the dictionary that is from each Element being converted into a dict/json
          into a dictionary that conforms to the schema expected by the
          Azure Cognitive Search index
          """
-
+         data = element_dict.copy()
          data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
          data[RECORD_ID_LABEL] = file_data.identifier

@@ -140,31 +143,6 @@ class AzureAISearchUploadStager(UploadStager):
              data["metadata"]["page_number"] = str(page_number)
          return data

-     def run(
-         self,
-         file_data: FileData,
-         elements_filepath: Path,
-         output_dir: Path,
-         output_filename: str,
-         **kwargs: Any,
-     ) -> Path:
-         with open(elements_filepath) as elements_file:
-             elements_contents = json.load(elements_file)
-
-         conformed_elements = [
-             self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-         ]
-
-         if Path(output_filename).suffix != ".json":
-             output_filename = f"{output_filename}.json"
-         else:
-             output_filename = f"{Path(output_filename).stem}.json"
-         output_path = Path(output_dir) / Path(f"{output_filename}.json")
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-         with open(output_path, "w") as output_file:
-             json.dump(conformed_elements, output_file, indent=2)
-         return output_path
-

  @dataclass
  class AzureAISearchUploader(Uploader):
@@ -270,9 +248,7 @@ class AzureAISearchUploader(Uploader):
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         with path.open("r") as file:
-             elements_dict = json.load(file)
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
          logger.info(
              f"writing document batches to destination"
              f" endpoint at {str(self.connection_config.endpoint)}"
@@ -287,7 +263,7 @@ class AzureAISearchUploader(Uploader):

          batch_size = self.upload_config.batch_size
          with self.connection_config.get_search_client() as search_client:
-             for chunk in batch_generator(elements_dict, batch_size):
+             for chunk in batch_generator(data, batch_size):
                  self.write_dict(elements_dict=chunk, search_client=search_client) # noqa: E203

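
Because get_search_client and get_search_index_client are now @contextmanager generators, callers acquire and release the Azure SDK clients in a with block, as the uploader above already does. A brief usage sketch, assuming a configured AzureAISearchConnectionConfig instance named connection_config:

    # Document-level client: closed automatically when the block exits.
    with connection_config.get_search_client() as search_client:
        print(search_client.get_document_count())

    # Index-management client: same acquire/release pattern.
    with connection_config.get_search_index_client() as index_client:
        index_client.get_index(name=connection_config.index)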