unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0

test/integration/connectors/utils/validation/destination.py
@@ -0,0 +1,88 @@
+import json
+import os
+import shutil
+from pathlib import Path
+
+import ndjson
+
+from test.integration.connectors.utils.validation.utils import ValidationConfig
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, UploadStager
+
+
+class StagerValidationConfigs(ValidationConfig):
+    expected_count: int
+
+    def stager_output_dir(self) -> Path:
+        dir = self.test_output_dir() / "stager"
+        dir.mkdir(exist_ok=True, parents=True)
+        return dir
+
+    def stager_output_path(self, input_path: Path) -> Path:
+        return self.stager_output_dir() / input_path.name
+
+
+def run_all_stager_validations(
+    configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
+):
+    # Validate matching extensions
+    assert input_file.suffix == staged_filepath.suffix
+
+    # Validate length
+    staged_data = get_data(staged_filepath=staged_filepath)
+    assert len(staged_data) == configs.expected_count
+
+    # Validate file
+    expected_filepath = configs.stager_output_path(input_path=input_file)
+    assert expected_filepath.exists(), f"{expected_filepath} does not exist"
+    assert expected_filepath.is_file(), f"{expected_filepath} is not a file"
+    if configs.detect_diff(expected_filepath=expected_filepath, current_filepath=staged_filepath):
+        raise AssertionError(
+            f"Current file ({staged_filepath}) does not match expected file: {expected_filepath}"
+        )
+
+
+def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
+    copied_filepath = stager_output_path / staged_filepath.name
+    shutil.copy(staged_filepath, copied_filepath)
+
+
+def get_data(staged_filepath: Path) -> list[dict]:
+    if staged_filepath.suffix == ".json":
+        with staged_filepath.open() as f:
+            return json.load(f)
+    elif staged_filepath.suffix == ".ndjson":
+        with staged_filepath.open() as f:
+            return ndjson.load(f)
+    else:
+        raise ValueError(f"Unsupported file type: {staged_filepath.suffix}")
+
+
+def stager_validation(
+    stager: UploadStager,
+    tmp_dir: Path,
+    input_file: Path,
+    configs: StagerValidationConfigs,
+    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
+) -> None:
+    # Run stager
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=input_file.name, filename=input_file.name),
+        connector_type=configs.test_id,
+        identifier="mock file data",
+    )
+    staged_filepath = stager.run(
+        elements_filepath=input_file,
+        file_data=file_data,
+        output_dir=tmp_dir,
+        output_filename=input_file.name,
+    )
+    if not overwrite_fixtures:
+        print("Running validation")
+        run_all_stager_validations(
+            configs=configs, input_file=input_file, staged_filepath=staged_filepath
+        )
+    else:
+        print("Running fixtures update")
+        update_stager_fixtures(
+            stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
+        )
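
The hunk above is the new stager-validation helper listed in the files-changed table as test/integration/connectors/utils/validation/destination.py. A hypothetical pytest-style caller might look like the sketch below; the stager fixture, input file, test id, and expected count are all made up for illustration:

    from pathlib import Path

    from test.integration.connectors.utils.validation.destination import (
        StagerValidationConfigs,
        stager_validation,
    )


    def test_example_stager(tmp_path: Path, example_stager):
        # example_stager: a hypothetical UploadStager fixture for the connector under test
        stager_validation(
            stager=example_stager,
            tmp_dir=tmp_path,
            input_file=Path("example-docs/fake-elements.ndjson"),  # hypothetical input fixture
            configs=StagerValidationConfigs(test_id="example_connector", expected_count=22),
        )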

test/integration/connectors/utils/validation/equality.py
@@ -0,0 +1,75 @@
+import json
+from pathlib import Path
+
+import ndjson
+from bs4 import BeautifulSoup
+from deepdiff import DeepDiff
+
+
+def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as f:
+        expected_data = json.load(f)
+    with current_filepath.open() as f:
+        current_data = json.load(f)
+    diff = DeepDiff(expected_data, current_data)
+    if diff:
+        print("diff between expected and current json")
+        print(diff.to_json(indent=2))
+        return False
+    return True
+
+
+def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as f:
+        expected_data = ndjson.load(f)
+    with current_filepath.open() as f:
+        current_data = ndjson.load(f)
+    if len(current_data) != len(expected_data):
+        print(
+            f"expected data length {len(expected_data)} "
+            f"didn't match current results: {len(current_data)}"
+        )
+    for i in range(len(expected_data)):
+        e = expected_data[i]
+        r = current_data[i]
+        if e != r:
+            print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
+            return False
+    return True
+
+
+def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+    return expected_soup.text == current_soup.text
+
+
+def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_text_lines = expected_f.readlines()
+    with current_filepath.open() as current_f:
+        current_text_lines = current_f.readlines()
+    if len(expected_text_lines) != len(current_text_lines):
+        print(
+            f"Lines in expected text file ({len(expected_text_lines)}) "
+            f"don't match current text file ({len(current_text_lines)})"
+        )
+        return False
+    expected_text = "\n".join(expected_text_lines)
+    current_text = "\n".join(current_text_lines)
+    if expected_text == current_text:
+        return True
+    print("txt content don't match:")
+    print(f"expected: {expected_text}")
+    print(f"current: {current_text}")
+    return False
+
+
+file_type_equality_check = {
+    ".json": json_equality_check,
+    ".ndjson": ndjson_equality_check,
+    ".html": html_equality_check,
+    ".txt": txt_equality_check,
+}
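
These checks are keyed by file suffix in file_type_equality_check, so validation code can dispatch on format without special-casing. A minimal sketch of that dispatch (the fixture and output paths are made up):

    from pathlib import Path

    from test.integration.connectors.utils.validation.equality import file_type_equality_check

    expected = Path("fixtures/expected-output.ndjson")   # hypothetical stored fixture
    current = Path("test-output/current-output.ndjson")  # hypothetical test output
    check = file_type_equality_check[expected.suffix]
    assert check(expected_filepath=expected, current_filepath=current), "outputs diverged"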

test/integration/connectors/utils/{validation.py → validation/source.py}
@@ -1,83 +1,27 @@
-import filecmp
 import json
 import os
 import shutil
-from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import Callable, Optional
 
-import pandas as pd
-from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
+from pydantic import Field
 
-from test.integration.connectors.utils.constants import expected_results_path
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    expected_df = pd.read_csv(expected_filepath)
-    current_df = pd.read_csv(current_filepath)
-    if expected_df.equals(current_df):
-        return True
-    # Print diff
-    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
-        lambda x: x["_merge"] != "both"
-    ]
-    print("diff between expected and current df:")
-    print(diff)
-    return False
-
-
-def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_soup = BeautifulSoup(expected_f, "html.parser")
-    with current_filepath.open() as current_f:
-        current_soup = BeautifulSoup(current_f, "html.parser")
-    return expected_soup.text == current_soup.text
-
-
-def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_text_lines = expected_f.readlines()
-    with current_filepath.open() as current_f:
-        current_text_lines = current_f.readlines()
-    if len(expected_text_lines) != len(current_text_lines):
-        print(
-            f"Lines in expected text file ({len(expected_text_lines)}) "
-            f"don't match current text file ({len(current_text_lines)})"
-        )
-        return False
-    expected_text = "\n".join(expected_text_lines)
-    current_text = "\n".join(current_text_lines)
-    if expected_text == current_text:
-        return True
-    print("txt content don't match:")
-    print(f"expected: {expected_text}")
-    print(f"current: {current_text}")
-    return False
-
-
-file_type_equality_check = {
-    ".json": json_equality_check,
-    ".html": html_equality_check,
-    ".txt": txt_equality_check,
-}
-
-
-@dataclass
-class ValidationConfigs:
-    test_id: str
+class SourceValidationConfigs(ValidationConfig):
     expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-    exclude_fields: list[str] = field(
+    exclude_fields: list[str] = Field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
     )
-    exclude_fields_extend: list[str] = field(default_factory=list)
+    exclude_fields_extend: list[str] = Field(default_factory=list)
     validate_downloaded_files: bool = False
     validate_file_data: bool = True
-    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -97,9 +41,6 @@ class ValidationConfigs:
         downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
         assert len(downloaded_files) == expected_num_files
 
-    def test_output_dir(self) -> Path:
-        return expected_results_path / self.test_id
-
     def omit_ignored_fields(self, data: dict) -> dict:
         exclude_fields = self.get_exclude_fields()
         # Ignore fields that dynamically change every time the tests run
@@ -143,14 +84,14 @@ def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
 
 
 def check_contents(
-    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     found_diff = False
     for file_data in all_file_data:
         file_data_path = expected_output_dir / f"{file_data.identifier}.json"
         with file_data_path.open("r") as file:
             expected_file_data_contents = json.load(file)
-        current_file_data_contents = file_data.to_dict()
+        current_file_data_contents = file_data.model_dump()
         expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
         current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
         diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,27 +101,10 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
-def detect_diff(
-    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
-) -> bool:
-    if expected_filepath.suffix != current_filepath.suffix:
-        return True
-    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-        return not downloaded_file_equality_check(expected_filepath, current_filepath)
-    current_suffix = expected_filepath.suffix
-    if current_suffix in file_type_equality_check:
-        equality_check_callable = file_type_equality_check[current_suffix]
-        return not equality_check_callable(
-            expected_filepath=expected_filepath, current_filepath=current_filepath
-        )
-    # Fallback is using filecmp.cmp to compare the files
-    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-
-
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
 ):
     current_files = get_files(dir_path=current_output_dir)
     found_diff = False
@@ -188,7 +112,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if detect_diff(configs, expected_file_path, current_file_path):
+        if configs.detect_diff(expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -196,7 +120,7 @@ def check_raw_file_contents(
 
 
 def run_expected_results_validation(
-    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
     check_contents(
@@ -207,7 +131,7 @@ def run_expected_results_validation(
 def run_expected_download_files_validation(
     expected_output_dir: Path,
     current_download_dir: Path,
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
 ):
     check_files_in_paths(
         expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
@@ -234,12 +158,12 @@ def update_fixtures(
     save_downloads: bool = False,
     save_filedata: bool = True,
 ):
-    # Delete current files
-    shutil.rmtree(path=output_dir, ignore_errors=True)
-    output_dir.mkdir(parents=True)
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -248,7 +172,7 @@
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-                json.dump(file_data.to_dict(), f, indent=2)
+                json.dump(file_data.model_dump(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -260,6 +184,7 @@
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -268,7 +193,7 @@
 
 
 def run_all_validations(
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
     predownload_file_data: list[FileData],
     postdownload_file_data: list[FileData],
     download_dir: Path,
@@ -289,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -305,10 +233,23 @@
     )
 
 
+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
-    configs: ValidationConfigs,
+    configs: SourceValidationConfigs,
     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
 ) -> None:
     # Run common validations on the process of running a source connector, supporting dynamic
@@ -322,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data = replace(file_data)
+        predownload_file_data = file_data.model_copy(deep=True)
         all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -330,10 +271,10 @@
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data = replace(r["file_data"])
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data = replace(resp["file_data"])
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
             all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         print("Running validation")
@@ -349,7 +290,10 @@
         update_fixtures(
             output_dir=test_output_dir,
             download_dir=download_dir,
-            all_file_data=all_postdownload_file_data,
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=all_predownload_file_data,
+                all_postdownload_file_data=all_postdownload_file_data,
+            ),
            save_downloads=configs.validate_downloaded_files,
            save_filedata=configs.validate_file_data,
         )
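
The new get_all_file_data helper keeps fixtures complete: post-download FileData wins, and any record that was indexed but never produced a download result is appended. A minimal sketch of that merge behavior (identifiers and connector type are made up):

    from unstructured_ingest.v2.interfaces import FileData

    indexed = [
        FileData(identifier="a", connector_type="example"),
        FileData(identifier="b", connector_type="example"),
    ]
    downloaded = [FileData(identifier="a", connector_type="example")]

    # mirrors get_all_file_data: downloaded entries first, indexed-only entries appended
    merged = downloaded + [
        fd for fd in indexed if fd.identifier not in {f.identifier for f in downloaded}
    ]
    assert [fd.identifier for fd in merged] == ["a", "b"]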

test/integration/connectors/utils/validation/utils.py
@@ -0,0 +1,36 @@
+import filecmp
+import shutil
+from pathlib import Path
+from typing import Callable, Optional
+
+from pydantic import BaseModel
+
+from test.integration.connectors.utils.constants import expected_results_path
+from test.integration.connectors.utils.validation.equality import file_type_equality_check
+
+
+class ValidationConfig(BaseModel):
+    test_id: str
+    file_equality_check: Optional[Callable[[Path, Path], bool]] = None
+
+    def test_output_dir(self) -> Path:
+        return expected_results_path / self.test_id
+
+    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
+        if expected_filepath.suffix != current_filepath.suffix:
+            return True
+        if file_equality_check := self.file_equality_check:
+            return not file_equality_check(expected_filepath, current_filepath)
+        current_suffix = expected_filepath.suffix
+        if current_suffix in file_type_equality_check:
+            equality_check_callable = file_type_equality_check[current_suffix]
+            return not equality_check_callable(
+                expected_filepath=expected_filepath, current_filepath=current_filepath
+            )
+        # Fallback is using filecmp.cmp to compare the files
+        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
+def reset_dir(dir_path: Path) -> None:
+    shutil.rmtree(path=dir_path, ignore_errors=True)
+    dir_path.mkdir(parents=True)
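
ValidationConfig now owns detect_diff, so source and stager validation share one comparison policy: mismatched suffixes are an immediate diff, a per-test file_equality_check overrides everything, known suffixes dispatch through file_type_equality_check, and anything else falls back to filecmp. A hypothetical direct use (test id and paths are made up):

    from pathlib import Path

    from test.integration.connectors.utils.validation.utils import ValidationConfig

    config = ValidationConfig(test_id="example-connector")
    if config.detect_diff(
        expected_filepath=Path("expected/result.ndjson"),
        current_filepath=Path("current/result.ndjson"),
    ):
        raise AssertionError("current output drifted from the stored fixture")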

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.8"  # pragma: no cover
+__version__ = "0.3.10"  # pragma: no cover

unstructured_ingest/utils/chunking.py
@@ -1,4 +1,7 @@
+import base64
 import hashlib
+import json
+import zlib
 from itertools import groupby
 
 
@@ -43,3 +46,11 @@ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
         e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]
 
     return elements
+
+
+def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
+    decoded_b64_bytes = base64.b64decode(raw_s)
+    elements_json_bytes = zlib.decompress(decoded_b64_bytes)
+    elements_json_str = elements_json_bytes.decode("utf-8")
+    element_dicts = json.loads(elements_json_str)
+    return element_dicts
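
elements_from_base64_gzipped_json reverses a base64 + zlib + JSON encoding, letting connectors unpack element payloads that were compressed into a single string field. A round-trip sketch using only the standard library (the sample element is made up):

    import base64
    import json
    import zlib

    from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json

    elements = [{"type": "CompositeElement", "text": "hello world"}]
    encoded = base64.b64encode(zlib.compress(json.dumps(elements).encode("utf-8"))).decode("utf-8")
    assert elements_from_base64_gzipped_json(encoded) == elements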

unstructured_ingest/utils/data_prep.py
@@ -1,8 +1,10 @@
 import itertools
 import json
 from datetime import datetime
+from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 
+import ndjson
 import pandas as pd
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -131,3 +133,37 @@ def validate_date_args(date: Optional[str] = None) -> bool:
         f"The argument {date} does not satisfy the format:"
         f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
     )
+
+
+def get_data(path: Path) -> list[dict]:
+    with path.open() as f:
+        if path.suffix == ".json":
+            return json.load(f)
+        elif path.suffix == ".ndjson":
+            return ndjson.load(f)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df.to_dict(orient="records")
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df.to_dict(orient="records")
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
+
+
+def get_data_df(path: Path) -> pd.DataFrame:
+    with path.open() as f:
+        if path.suffix == ".json":
+            data = json.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".ndjson":
+            data = ndjson.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
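
get_data and get_data_df give callers a single entry point for loading results regardless of on-disk format (.json, .ndjson, .csv, .parquet). A hypothetical usage sketch (the path is made up; the ndjson and parquet branches require the ndjson and pyarrow packages):

    from pathlib import Path

    from unstructured_ingest.utils.data_prep import get_data, get_data_df

    staged = Path("staged/elements.ndjson")  # hypothetical staged output file
    records = get_data(path=staged)   # list[dict], one entry per element
    df = get_data_df(path=staged)     # the same rows as a pandas DataFrame
    assert len(records) == len(df)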

unstructured_ingest/v2/interfaces/__init__.py
@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]

unstructured_ingest/v2/interfaces/file_data.py
@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Literal, Optional
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
 
-from dataclasses_json import DataClassJsonMixin
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 
+from unstructured_ingest.v2.logger import logger
 
-@dataclass
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath
 
 
-@dataclass
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None
 
 
-@dataclass
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
-    doc_type: Literal["file", "batch"] = field(default="file")
-    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,57 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-        file_data = FileData.from_dict(file_data_dict)
+        file_data = cls.model_validate(file_data_dict)
         return file_data
 
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)
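
BatchFileData lets a connector treat a whole batch of records as one unit of work: batch_items must be non-empty and unique, they are sorted for stability, and identifier is derived deterministically via uuid5 from the item identifiers and versions, so the same batch always maps to the same file data. The new file_data_from_file helper then tries BatchFileData first and falls back to plain FileData when deserializing. A sketch of the identifier behavior (connector type and item ids are made up):

    from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem

    batch = BatchFileData(
        connector_type="example-connector",
        batch_items=[BatchItem(identifier="doc-2"), BatchItem(identifier="doc-1", version="3")],
    )
    # items are sorted by identifier; the identifier is a stable uuid5 of ids/versions
    assert [item.identifier for item in batch.batch_items] == ["doc-1", "doc-2"]

    same_batch = BatchFileData(
        connector_type="example-connector",
        batch_items=[BatchItem(identifier="doc-1", version="3"), BatchItem(identifier="doc-2")],
    )
    assert batch.identifier == same_batch.identifier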