unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/validation/destination.py
@@ -0,0 +1,88 @@
+import json
+import os
+import shutil
+from pathlib import Path
+
+import ndjson
+
+from test.integration.connectors.utils.validation.utils import ValidationConfig
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, UploadStager
+
+
+class StagerValidationConfigs(ValidationConfig):
+    expected_count: int
+
+    def stager_output_dir(self) -> Path:
+        dir = self.test_output_dir() / "stager"
+        dir.mkdir(exist_ok=True, parents=True)
+        return dir
+
+    def stager_output_path(self, input_path: Path) -> Path:
+        return self.stager_output_dir() / input_path.name
+
+
+def run_all_stager_validations(
+    configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
+):
+    # Validate matching extensions
+    assert input_file.suffix == staged_filepath.suffix
+
+    # Validate length
+    staged_data = get_data(staged_filepath=staged_filepath)
+    assert len(staged_data) == configs.expected_count
+
+    # Validate file
+    expected_filepath = configs.stager_output_path(input_path=input_file)
+    assert expected_filepath.exists(), f"{expected_filepath} does not exist"
+    assert expected_filepath.is_file(), f"{expected_filepath} is not a file"
+    if configs.detect_diff(expected_filepath=expected_filepath, current_filepath=staged_filepath):
+        raise AssertionError(
+            f"Current file ({staged_filepath}) does not match expected file: {expected_filepath}"
+        )
+
+
+def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
+    copied_filepath = stager_output_path / staged_filepath.name
+    shutil.copy(staged_filepath, copied_filepath)
+
+
+def get_data(staged_filepath: Path) -> list[dict]:
+    if staged_filepath.suffix == ".json":
+        with staged_filepath.open() as f:
+            return json.load(f)
+    elif staged_filepath.suffix == ".ndjson":
+        with staged_filepath.open() as f:
+            return ndjson.load(f)
+    else:
+        raise ValueError(f"Unsupported file type: {staged_filepath.suffix}")
+
+
+def stager_validation(
+    stager: UploadStager,
+    tmp_dir: Path,
+    input_file: Path,
+    configs: StagerValidationConfigs,
+    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
+) -> None:
+    # Run stager
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=input_file.name, filename=input_file.name),
+        connector_type=configs.test_id,
+        identifier="mock file data",
+    )
+    staged_filepath = stager.run(
+        elements_filepath=input_file,
+        file_data=file_data,
+        output_dir=tmp_dir,
+        output_filename=input_file.name,
+    )
+    if not overwrite_fixtures:
+        print("Running validation")
+        run_all_stager_validations(
+            configs=configs, input_file=input_file, staged_filepath=staged_filepath
+        )
+    else:
+        print("Running fixtures update")
+        update_stager_fixtures(
+            stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
+        )
test/integration/connectors/utils/validation/equality.py
@@ -0,0 +1,75 @@
+import json
+from pathlib import Path
+
+import ndjson
+from bs4 import BeautifulSoup
+from deepdiff import DeepDiff
+
+
+def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as f:
+        expected_data = json.load(f)
+    with current_filepath.open() as f:
+        current_data = json.load(f)
+    diff = DeepDiff(expected_data, current_data)
+    if diff:
+        print("diff between expected and current json")
+        print(diff.to_json(indent=2))
+        return False
+    return True
+
+
+def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as f:
+        expected_data = ndjson.load(f)
+    with current_filepath.open() as f:
+        current_data = ndjson.load(f)
+    if len(current_data) != len(expected_data):
+        print(
+            f"expected data length {len(expected_data)} "
+            f"didn't match current results: {len(current_data)}"
+        )
+    for i in range(len(expected_data)):
+        e = expected_data[i]
+        r = current_data[i]
+        if e != r:
+            print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
+            return False
+    return True
+
+
+def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+    return expected_soup.text == current_soup.text
+
+
+def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    with expected_filepath.open() as expected_f:
+        expected_text_lines = expected_f.readlines()
+    with current_filepath.open() as current_f:
+        current_text_lines = current_f.readlines()
+    if len(expected_text_lines) != len(current_text_lines):
+        print(
+            f"Lines in expected text file ({len(expected_text_lines)}) "
+            f"don't match current text file ({len(current_text_lines)})"
+        )
+        return False
+    expected_text = "\n".join(expected_text_lines)
+    current_text = "\n".join(current_text_lines)
+    if expected_text == current_text:
+        return True
+    print("txt content don't match:")
+    print(f"expected: {expected_text}")
+    print(f"current: {current_text}")
+    return False
+
+
+file_type_equality_check = {
+    ".json": json_equality_check,
+    ".ndjson": ndjson_equality_check,
+    ".html": html_equality_check,
+    ".txt": txt_equality_check,
+}
test/integration/connectors/utils/validation.py → test/integration/connectors/utils/validation/source.py
@@ -1,83 +1,27 @@
-import filecmp
 import json
 import os
 import shutil
-from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import Callable, Optional

-import pandas as pd
-from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
+from pydantic import Field

-from test.integration.connectors.utils.
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer


-
-    expected_df = pd.read_csv(expected_filepath)
-    current_df = pd.read_csv(current_filepath)
-    if expected_df.equals(current_df):
-        return True
-    # Print diff
-    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
-        lambda x: x["_merge"] != "both"
-    ]
-    print("diff between expected and current df:")
-    print(diff)
-    return False
-
-
-def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_soup = BeautifulSoup(expected_f, "html.parser")
-    with current_filepath.open() as current_f:
-        current_soup = BeautifulSoup(current_f, "html.parser")
-    return expected_soup.text == current_soup.text
-
-
-def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_text_lines = expected_f.readlines()
-    with current_filepath.open() as current_f:
-        current_text_lines = current_f.readlines()
-    if len(expected_text_lines) != len(current_text_lines):
-        print(
-            f"Lines in expected text file ({len(expected_text_lines)}) "
-            f"don't match current text file ({len(current_text_lines)})"
-        )
-        return False
-    expected_text = "\n".join(expected_text_lines)
-    current_text = "\n".join(current_text_lines)
-    if expected_text == current_text:
-        return True
-    print("txt content don't match:")
-    print(f"expected: {expected_text}")
-    print(f"current: {current_text}")
-    return False
-
-
-file_type_equality_check = {
-    ".json": json_equality_check,
-    ".html": html_equality_check,
-    ".txt": txt_equality_check,
-}
-
-
-@dataclass
-class ValidationConfigs:
-    test_id: str
+class SourceValidationConfigs(ValidationConfig):
     expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-    exclude_fields: list[str] =
+    exclude_fields: list[str] = Field(
         default_factory=lambda: ["local_download_path", "metadata.date_processed"]
     )
-    exclude_fields_extend: list[str] =
+    exclude_fields_extend: list[str] = Field(default_factory=list)
     validate_downloaded_files: bool = False
     validate_file_data: bool = True
-    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None

     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -97,9 +41,6 @@ class ValidationConfigs:
         downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
         assert len(downloaded_files) == expected_num_files

-    def test_output_dir(self) -> Path:
-        return expected_results_path / self.test_id
-
     def omit_ignored_fields(self, data: dict) -> dict:
         exclude_fields = self.get_exclude_fields()
         # Ignore fields that dynamically change every time the tests run
@@ -143,14 +84,14 @@ def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):


 def check_contents(
-    expected_output_dir: Path, all_file_data: list[FileData], configs:
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     found_diff = False
     for file_data in all_file_data:
         file_data_path = expected_output_dir / f"{file_data.identifier}.json"
         with file_data_path.open("r") as file:
             expected_file_data_contents = json.load(file)
-        current_file_data_contents = file_data.
+        current_file_data_contents = file_data.model_dump()
         expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
         current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
         diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,27 +101,10 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"


-def detect_diff(
-    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
-) -> bool:
-    if expected_filepath.suffix != current_filepath.suffix:
-        return True
-    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-        return not downloaded_file_equality_check(expected_filepath, current_filepath)
-    current_suffix = expected_filepath.suffix
-    if current_suffix in file_type_equality_check:
-        equality_check_callable = file_type_equality_check[current_suffix]
-        return not equality_check_callable(
-            expected_filepath=expected_filepath, current_filepath=current_filepath
-        )
-    # Fallback is using filecmp.cmp to compare the files
-    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-
-
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
-    configs:
+    configs: SourceValidationConfigs,
 ):
     current_files = get_files(dir_path=current_output_dir)
     found_diff = False
@@ -188,7 +112,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if detect_diff(
+        if configs.detect_diff(expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -196,7 +120,7 @@ def check_raw_file_contents(


 def run_expected_results_validation(
-    expected_output_dir: Path, all_file_data: list[FileData], configs:
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
     check_contents(
@@ -207,7 +131,7 @@ def run_expected_results_validation(
 def run_expected_download_files_validation(
     expected_output_dir: Path,
     current_download_dir: Path,
-    configs:
+    configs: SourceValidationConfigs,
 ):
     check_files_in_paths(
         expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
@@ -234,12 +158,12 @@ def update_fixtures(
     save_downloads: bool = False,
     save_filedata: bool = True,
 ):
-    # Delete current files
-    shutil.rmtree(path=output_dir, ignore_errors=True)
-    output_dir.mkdir(parents=True)
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -248,7 +172,7 @@ def update_fixtures(
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-            json.dump(file_data.
+                json.dump(file_data.model_dump(), f, indent=2)

     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -260,6 +184,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -268,7 +193,7 @@ def update_fixtures(


 def run_all_validations(
-    configs:
+    configs: SourceValidationConfigs,
     predownload_file_data: list[FileData],
     postdownload_file_data: list[FileData],
     download_dir: Path,
@@ -289,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -305,10 +233,23 @@ def run_all_validations(
         )


+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
-    configs:
+    configs: SourceValidationConfigs,
     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
 ) -> None:
     # Run common validations on the process of running a source connector, supporting dynamic
@@ -322,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data =
+        predownload_file_data = file_data.model_copy(deep=True)
         all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -330,10 +271,10 @@ async def source_connector_validation(
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data =
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data =
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
            all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         print("Running validation")
@@ -349,7 +290,10 @@ async def source_connector_validation(
         update_fixtures(
             output_dir=test_output_dir,
             download_dir=download_dir,
-            all_file_data=
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=all_predownload_file_data,
+                all_postdownload_file_data=all_postdownload_file_data,
+            ),
             save_downloads=configs.validate_downloaded_files,
             save_filedata=configs.validate_file_data,
         )
test/integration/connectors/utils/validation/utils.py
@@ -0,0 +1,36 @@
+import filecmp
+import shutil
+from pathlib import Path
+from typing import Callable, Optional
+
+from pydantic import BaseModel
+
+from test.integration.connectors.utils.constants import expected_results_path
+from test.integration.connectors.utils.validation.equality import file_type_equality_check
+
+
+class ValidationConfig(BaseModel):
+    test_id: str
+    file_equality_check: Optional[Callable[[Path, Path], bool]] = None
+
+    def test_output_dir(self) -> Path:
+        return expected_results_path / self.test_id
+
+    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
+        if expected_filepath.suffix != current_filepath.suffix:
+            return True
+        if file_equality_check := self.file_equality_check:
+            return not file_equality_check(expected_filepath, current_filepath)
+        current_suffix = expected_filepath.suffix
+        if current_suffix in file_type_equality_check:
+            equality_check_callable = file_type_equality_check[current_suffix]
+            return not equality_check_callable(
+                expected_filepath=expected_filepath, current_filepath=current_filepath
+            )
+        # Fallback is using filecmp.cmp to compare the files
+        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
+def reset_dir(dir_path: Path) -> None:
+    shutil.rmtree(path=dir_path, ignore_errors=True)
+    dir_path.mkdir(parents=True)
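The new ValidationConfig.detect_diff cascades from an optional per-test comparator, to the per-suffix table defined in equality.py, to a raw filecmp.cmp fallback. A minimal sketch of plugging in a custom comparator is below; the size-based check, the test_id value, and the fixture paths are illustrative assumptions, not part of the package.

```python
from pathlib import Path

from test.integration.connectors.utils.validation.utils import ValidationConfig


def size_equality_check(expected: Path, current: Path) -> bool:
    # Hypothetical comparator: treat files as equal when their byte sizes match.
    return expected.stat().st_size == current.stat().st_size


config = ValidationConfig(test_id="example-connector", file_equality_check=size_equality_check)

# With file_equality_check set, it is used for every file; without it, detect_diff
# dispatches on the suffix (.json/.ndjson/.html/.txt) and finally falls back to filecmp.cmp.
has_diff = config.detect_diff(
    expected_filepath=Path("expected/sample.txt"),  # placeholder fixture paths
    current_filepath=Path("current/sample.txt"),
)
```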
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.8" # pragma: no cover
+__version__ = "0.3.10" # pragma: no cover
unstructured_ingest/utils/chunking.py
@@ -1,4 +1,7 @@
+import base64
 import hashlib
+import json
+import zlib
 from itertools import groupby


@@ -43,3 +46,11 @@ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
             e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]

     return elements
+
+
+def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
+    decoded_b64_bytes = base64.b64decode(raw_s)
+    elements_json_bytes = zlib.decompress(decoded_b64_bytes)
+    elements_json_str = elements_json_bytes.decode("utf-8")
+    element_dicts = json.loads(elements_json_str)
+    return element_dicts
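The new helper reverses a zlib-compress-then-base64 encoding of a list of element dicts. A round-trip sketch follows; the encoding side shown here is written only for illustration and is not a helper exported by the package.

```python
import base64
import json
import zlib

from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json

elements = [{"type": "Title", "text": "Hello"}, {"type": "NarrativeText", "text": "World"}]

# Encode the same way the helper expects to decode: JSON -> zlib -> base64 string.
raw_s = base64.b64encode(zlib.compress(json.dumps(elements).encode("utf-8"))).decode("utf-8")

assert elements_from_base64_gzipped_json(raw_s) == elements
```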
unstructured_ingest/utils/data_prep.py
@@ -1,8 +1,10 @@
 import itertools
 import json
 from datetime import datetime
+from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast

+import ndjson
 import pandas as pd

 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -131,3 +133,37 @@ def validate_date_args(date: Optional[str] = None) -> bool:
         f"The argument {date} does not satisfy the format:"
         f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
     )
+
+
+def get_data(path: Path) -> list[dict]:
+    with path.open() as f:
+        if path.suffix == ".json":
+            return json.load(f)
+        elif path.suffix == ".ndjson":
+            return ndjson.load(f)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df.to_dict(orient="records")
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df.to_dict(orient="records")
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
+
+
+def get_data_df(path: Path) -> pd.DataFrame:
+    with path.open() as f:
+        if path.suffix == ".json":
+            data = json.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".ndjson":
+            data = ndjson.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
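Both new readers dispatch on the file suffix, so callers can load .json, .ndjson, .csv, or .parquet fixtures through one entry point. A small usage sketch with an assumed file name:

```python
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data, get_data_df

path = Path("fixture.ndjson")  # hypothetical path, one JSON object per line
path.write_text('{"text": "hello"}\n{"text": "world"}\n')

records = get_data(path)   # -> list[dict]
frame = get_data_df(path)  # -> pandas.DataFrame with the same rows
assert len(records) == len(frame) == 2
```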
unstructured_ingest/v2/interfaces/__init__.py
@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]
unstructured_ingest/v2/interfaces/file_data.py
@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any,
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5

-from
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator

+from unstructured_ingest.v2.logger import logger

-
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath


-
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None


-
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
-
-
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,57 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-        file_data =
+        file_data = cls.model_validate(file_data_dict)
         return file_data

+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)