unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import shutil
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from test.integration.connectors.utils.validation.utils import ValidationConfig
|
|
6
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
7
|
-
from unstructured_ingest.interfaces import UploadStager
|
|
8
|
-
from unstructured_ingest.utils.data_prep import get_data
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class StagerValidationConfigs(ValidationConfig):
    """Validation settings for an upload-stager test.

    Extends ``ValidationConfig`` with the number of records the staged file
    must contain and the name of the fixture sub-folder.
    """

    # Number of records the staged output is expected to hold.
    expected_count: int
    # Sub-folder (under the test output directory) that stores stager fixtures.
    expected_folder: str = "stager"

    def stager_output_dir(self) -> Path:
        """Return the stager fixture directory, creating it if it is missing."""
        output_dir = self.test_output_dir() / self.expected_folder
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir

    def stager_output_path(self, input_path: Path) -> Path:
        """Return the fixture path that shares its file name with ``input_path``."""
        return self.stager_output_dir() / input_path.name
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def run_all_stager_validations(
    configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
):
    """Assert that a staged file agrees with the stored fixture for ``input_file``.

    Checks, in order: the staged file keeps the input's suffix, it holds
    ``configs.expected_count`` records, the fixture exists and is a regular
    file, and ``configs.detect_diff`` reports no difference.

    Raises:
        AssertionError: if any of the checks fails.
    """
    # The stager must not change the file extension.
    assert input_file.suffix == staged_filepath.suffix

    # The staged output must contain the expected number of records.
    staged_records = get_data(path=staged_filepath)
    assert len(staged_records) == configs.expected_count

    # The fixture must be present and match what was just staged.
    fixture_path = configs.stager_output_path(input_path=input_file)
    assert fixture_path.exists(), f"{fixture_path} does not exist"
    assert fixture_path.is_file(), f"{fixture_path} is not a file"
    has_diff = configs.detect_diff(
        expected_filepath=fixture_path, current_filepath=staged_filepath
    )
    if has_diff:
        raise AssertionError(
            f"Current file ({staged_filepath}) does not match expected file: {fixture_path}"
        )
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
    """Copy the staged file into the fixture directory under its own file name.

    Args:
        stager_output_path: directory that receives the copy.
        staged_filepath: file produced by the stager.
    """
    shutil.copy(staged_filepath, stager_output_path / staged_filepath.name)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def stager_validation(
    stager: UploadStager,
    tmp_dir: Path,
    input_file: Path,
    configs: StagerValidationConfigs,
    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
) -> None:
    """Run a stager on ``input_file`` and validate or refresh its fixture.

    Args:
        stager: stager under test; its ``run`` method is invoked once.
        tmp_dir: directory handed to the stager as ``output_dir``.
        input_file: elements file to stage.
        configs: validation settings for this test.
        overwrite_fixtures: when true, the staged file replaces the stored
            fixture instead of being validated. The default is read from the
            ``OVERWRITE_FIXTURES`` environment variable once, when this
            function is defined.
    """
    filename = input_file.name
    # Minimal file data so the stager has something to attach its output to.
    mock_file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=filename, filename=filename),
        connector_type=configs.test_id,
        identifier="mock file data",
    )
    staged_filepath = stager.run(
        elements_filepath=input_file,
        file_data=mock_file_data,
        output_dir=tmp_dir,
        output_filename=filename,
    )

    if overwrite_fixtures:
        print("Running fixtures update")
        update_stager_fixtures(
            stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
        )
        return

    print("Running validation")
    run_all_stager_validations(
        configs=configs, input_file=input_file, staged_filepath=staged_filepath
    )
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from bs4 import BeautifulSoup
|
|
5
|
-
from deepdiff import DeepDiff
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.utils import ndjson
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when both JSON files parse to data with an empty ``DeepDiff``.

    When a difference is found it is printed as indented JSON and False is
    returned.
    """
    parsed = []
    for path in (expected_filepath, current_filepath):
        with path.open() as fp:
            parsed.append(json.load(fp))
    expected_data, current_data = parsed

    diff = DeepDiff(expected_data, current_data)
    if not diff:
        return True
    print("diff between expected and current json")
    print(diff.to_json(indent=2))
    return False
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when both NDJSON files hold equal records in the same order.

    A record-count mismatch is reported and treated as a difference. Before,
    the mismatch was only printed: a shorter current file then raised
    ``IndexError`` and a longer one with a matching prefix was reported as
    equal.

    Args:
        expected_filepath: stored fixture file.
        current_filepath: file produced by the current run.

    Returns:
        True if the files contain the same records, False otherwise.
    """
    with expected_filepath.open() as f:
        expected_data = ndjson.load(f)
    with current_filepath.open() as f:
        current_data = ndjson.load(f)

    if len(current_data) != len(expected_data):
        print(
            f"expected data length {len(expected_data)} "
            f"didn't match current results: {len(current_data)}"
        )
        return False

    # Lengths are equal here, so zip() cannot silently drop trailing records.
    for i, (e, r) in enumerate(zip(expected_data, current_data)):
        if e != r:
            print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
            return False
    return True
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when both HTML files yield the same ``.text`` once parsed.

    Each file is parsed with BeautifulSoup's ``html.parser``; only the
    extracted text is compared, not the markup.
    """

    def _extract_text(path: Path) -> str:
        # Parse while the handle is open; the text is read from the parsed tree.
        with path.open() as fp:
            soup = BeautifulSoup(fp, "html.parser")
        return soup.text

    return _extract_text(expected_filepath) == _extract_text(current_filepath)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    """Return True when both text files contain exactly the same lines.

    On a mismatch the function prints either the differing line counts or the
    full contents of both files, then returns False.

    Args:
        expected_filepath: stored fixture file.
        current_filepath: file produced by the current run.
    """
    with expected_filepath.open() as expected_f:
        expected_text_lines = expected_f.readlines()
    with current_filepath.open() as current_f:
        current_text_lines = current_f.readlines()

    if len(expected_text_lines) != len(current_text_lines):
        print(
            f"Lines in expected text file ({len(expected_text_lines)}) "
            f"don't match current text file ({len(current_text_lines)})"
        )
        return False

    if expected_text_lines == current_text_lines:
        return True

    # readlines() keeps each line's trailing newline, so join with "" to print
    # the contents as they are on disk rather than with doubled line breaks.
    expected_text = "".join(expected_text_lines)
    current_text = "".join(current_text_lines)
    print("txt content don't match:")
    print(f"expected: {expected_text}")
    print(f"current: {current_text}")
    return False
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
# Dispatch table: file suffix -> function that decides whether two files of
# that type are equal. Each checker takes (expected_filepath, current_filepath)
# and returns a bool.
file_type_equality_check = dict(
    (
        (".json", json_equality_check),
        (".ndjson", ndjson_equality_check),
        (".html", html_equality_check),
        (".txt", txt_equality_check),
    )
)
|
|
@@ -1,331 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import shutil
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import Callable, Optional
|
|
6
|
-
|
|
7
|
-
from deepdiff import DeepDiff
|
|
8
|
-
from pydantic import Field
|
|
9
|
-
|
|
10
|
-
from test.integration.connectors.utils.validation.utils import ValidationConfig
|
|
11
|
-
from unstructured_ingest.data_types.file_data import FileData
|
|
12
|
-
from unstructured_ingest.interfaces import Downloader, Indexer
|
|
13
|
-
|
|
14
|
-
# Key path for an exclude-field whose last key contains dots itself, so the
# dotted name cannot simply be split on ".".
_GRAPH_DOWNLOAD_URL_PATH = ["additional_metadata", "@microsoft.graph.downloadUrl"]

# Maps such dotted exclude-field names to their explicit key path.
NONSTANDARD_METADATA_FIELDS = {
    "additional_metadata.@microsoft.graph.downloadUrl": _GRAPH_DOWNLOAD_URL_PATH,
}
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class SourceValidationConfigs(ValidationConfig):
    """Validation settings for a source-connector (indexer + downloader) test."""

    # Expected number of records yielded by the indexer; falsy disables the check.
    expected_number_indexed_file_data: Optional[int] = None
    # Expected number of downloaded files; falsy disables the check.
    expected_num_files: Optional[int] = None
    # Optional per-record hooks run on the pre-/post-download file data.
    predownload_file_data_check: Optional[Callable[[FileData], None]] = None
    postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
    # Dotted field paths dropped before file data is compared.
    exclude_fields: list[str] = Field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
    )
    # Extra dotted field paths to drop on top of ``exclude_fields``.
    exclude_fields_extend: list[str] = Field(default_factory=list)
    validate_downloaded_files: bool = False
    validate_file_data: bool = True

    def get_exclude_fields(self) -> list[str]:
        """Return ``exclude_fields`` plus ``exclude_fields_extend`` without duplicates.

        A new list is built on every call. Before, ``self.exclude_fields`` was
        extended in place, so the configured list grew each time this ran.
        """
        combined = [*self.exclude_fields, *self.exclude_fields_extend]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(combined))

    def run_file_data_validation(
        self, predownload_file_data: FileData, postdownload_file_data: FileData
    ):
        """Invoke the configured pre- and post-download hooks, when set."""
        if predownload_file_data_check := self.predownload_file_data_check:
            predownload_file_data_check(predownload_file_data)
        if postdownload_file_data_check := self.postdownload_file_data_check:
            postdownload_file_data_check(postdownload_file_data)

    def run_download_dir_validation(self, download_dir: Path):
        """Assert the number of files under ``download_dir`` (recursively).

        Skipped when ``expected_num_files`` is falsy.
        """
        if expected_num_files := self.expected_num_files:
            downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
            assert len(downloaded_files) == expected_num_files

    def omit_ignored_fields(self, data: dict) -> dict:
        """Return a copy of ``data`` with every excluded field removed.

        Each exclude entry is a dotted key path (or a key of
        ``NONSTANDARD_METADATA_FIELDS``). A trailing ``*`` on a nested path
        empties the parent mapping. ``data`` itself is left untouched: the
        copy is deep, so removing nested keys no longer edits the caller's
        dictionaries.
        """
        from copy import deepcopy

        # Drop fields that change on every test run.
        copied_data = deepcopy(data)

        for exclude_field in self.get_exclude_fields():
            if exclude_field in NONSTANDARD_METADATA_FIELDS:
                key_path = NONSTANDARD_METADATA_FIELDS[exclude_field]
            else:
                key_path = exclude_field.split(".")
            *parent_keys, drop_field = key_path

            target = copied_data
            for key in parent_keys:
                target = target.get(key)
                if not isinstance(target, dict):
                    # Path is missing or not a mapping: nothing to remove.
                    break
            else:
                # The wildcard only applies below the top level; a bare "*"
                # is treated as an ordinary key name.
                if parent_keys and drop_field == "*":
                    target.clear()
                else:
                    target.pop(drop_field, None)
        return copied_data
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def get_files(dir_path: Path) -> list[str]:
    """Return every file under ``dir_path`` as a path relative to it.

    Directories are skipped and the search is recursive. Paths use forward
    slashes. Only the leading ``dir_path`` prefix is removed; the earlier
    string replacement also stripped any later occurrence of the same text.

    Args:
        dir_path: directory to walk.

    Returns:
        Relative file paths, in the order ``Path.rglob`` yields them.
    """
    return [f.relative_to(dir_path).as_posix() for f in dir_path.rglob("*") if f.is_file()]
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
    """Assert that the fixture directory holds exactly one JSON file per file data.

    The expected names come from the files found under ``expected_output_dir``;
    the current names are ``<identifier>.json`` for each entry of
    ``all_file_data``.

    Raises:
        AssertionError: listing every name present on only one side.
    """
    expected_names = set(get_files(dir_path=expected_output_dir))
    current_names = {f"{fd.identifier}.json" for fd in all_file_data}
    mismatched = expected_names.symmetric_difference(current_names)
    assert not mismatched, "diff in files that exist: {}".format(", ".join(mismatched))
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
    """Assert that both directories contain the same set of relative file paths.

    Raises:
        AssertionError: listing every path present in only one directory.
    """
    expected_names = set(get_files(dir_path=expected_output_dir))
    current_names = set(get_files(dir_path=current_output_dir))
    mismatched = expected_names.symmetric_difference(current_names)
    assert not mismatched, "diff in files that exist: {}".format(", ".join(mismatched))
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def check_contents(
    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
):
    """Assert that each file data matches its stored JSON fixture.

    For every entry the fixture ``<identifier>.json`` is loaded, excluded
    fields are removed from both sides via ``configs.omit_ignored_fields``,
    and the results are compared with ``DeepDiff``. Every diff is printed.

    Raises:
        AssertionError: naming the fixture files whose contents differ. The
            message used to contain only the boolean flag.
    """
    files_with_diff = []
    for file_data in all_file_data:
        file_data_path = expected_output_dir / f"{file_data.identifier}.json"
        with file_data_path.open("r") as file:
            expected_file_data_contents = json.load(file)
        # Round-trip through JSON so both sides hold plain JSON types.
        current_file_data_contents = json.loads(file_data.model_dump_json())
        expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
        current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
        diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
        if diff:
            files_with_diff.append(str(file_data_path))
            print(diff.to_json(indent=2))
    assert not files_with_diff, "Diffs found between files: {}".format(
        ", ".join(files_with_diff)
    )
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def check_raw_file_contents(
    expected_output_dir: Path,
    current_output_dir: Path,
    configs: SourceValidationConfigs,
):
    """Assert that every file in ``current_output_dir`` matches its fixture.

    Each file found under ``current_output_dir`` is compared, via
    ``configs.detect_diff``, with the file at the same relative path under
    ``expected_output_dir``.

    Raises:
        AssertionError: listing the expected files that differ.
    """
    differing = []
    for relative_name in get_files(dir_path=current_output_dir):
        current_path = current_output_dir / relative_name
        expected_path = expected_output_dir / relative_name
        if not configs.detect_diff(expected_path, current_path):
            continue
        differing.append(str(expected_path))
        print(f"diffs between files {expected_path} and {current_path}")
    assert not differing, "Diffs found between files: {}".format(", ".join(differing))
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
def run_expected_results_validation(
    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
):
    """Check the file-data fixtures: first the file names, then the contents."""
    shared_args = {
        "expected_output_dir": expected_output_dir,
        "all_file_data": all_file_data,
    }
    check_files(**shared_args)
    check_contents(**shared_args, configs=configs)
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def run_expected_download_files_validation(
    expected_output_dir: Path,
    current_download_dir: Path,
    configs: SourceValidationConfigs,
):
    """Check the downloaded files: first the directory listing, then the contents."""
    shared_args = {
        "expected_output_dir": expected_output_dir,
        "current_output_dir": current_download_dir,
    }
    check_files_in_paths(**shared_args)
    check_raw_file_contents(**shared_args, configs=configs)
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
    """Assert that ``download_files`` equals the recorded directory structure.

    The record is the ``directory_structure`` entry of
    ``directory_structure.json`` inside ``expected_output_dir``.

    Raises:
        AssertionError: if the recorded list differs from ``download_files``.
    """
    record_path = expected_output_dir / "directory_structure.json"
    with record_path.open("r") as record_file:
        recorded = json.load(record_file)
    assert recorded["directory_structure"] == download_files
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
def update_fixtures(
    output_dir: Path,
    download_dir: Path,
    all_file_data: list[FileData],
    save_downloads: bool = False,
    save_filedata: bool = True,
):
    """Rewrite the stored fixtures from the results of the current run.

    Args:
        output_dir: fixture directory for this test; created when missing.
        download_dir: directory the downloader wrote into.
        all_file_data: file data to store, one JSON file per identifier.
        save_downloads: also copy the raw download tree into ``downloads``.
        save_filedata: write the ``file_data`` JSON files.
    """
    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    # Replace the stored file data with what this run produced.
    if save_filedata:
        file_data_dir = output_dir / "file_data"
        shutil.rmtree(path=file_data_dir, ignore_errors=True)
        print(
            f"Writing {len(all_file_data)} file data to "
            f"saved fixture location {file_data_dir}"
        )
        file_data_dir.mkdir(parents=True, exist_ok=True)
        for file_data in all_file_data:
            target = file_data_dir / f"{file_data.identifier}.json"
            with target.open(mode="w") as f:
                f.write(file_data.model_dump_json(indent=2))

    # Record the layout of the download directory, sorted for stable output.
    download_files = sorted(get_files(dir_path=download_dir))
    structure_record = output_dir / "directory_structure.json"
    with structure_record.open(mode="w") as f:
        json.dump({"directory_structure": download_files}, f, indent=2)

    if not save_downloads:
        return

    # Keep a copy of the raw downloads alongside the other fixtures.
    raw_downloads_dir = output_dir / "downloads"
    shutil.rmtree(path=raw_downloads_dir, ignore_errors=True)
    print(
        f"Writing {len(download_files)} downloaded files to "
        f"saved fixture location {raw_downloads_dir}"
    )
    shutil.copytree(download_dir, raw_downloads_dir)
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
def run_all_validations(
    configs: SourceValidationConfigs,
    predownload_file_data: list[FileData],
    postdownload_file_data: list[FileData],
    download_dir: Path,
    test_output_dir: Path,
):
    """Run every configured check on the results of a source-connector run.

    Covers, in order: record counts (when configured), the per-record hooks,
    the download-directory file count, the stored file-data fixtures, the
    recorded directory structure and, optionally, the raw downloaded files.

    Raises:
        AssertionError: from the first check that fails.
    """
    expected_indexed = configs.expected_number_indexed_file_data
    if expected_indexed:
        indexed_count = len(predownload_file_data)
        assert (
            indexed_count == expected_indexed
        ), f"expected {expected_indexed} but got {indexed_count}"

    expected_downloaded = configs.expected_num_files
    if expected_downloaded:
        downloaded_count = len(postdownload_file_data)
        assert (
            downloaded_count == expected_downloaded
        ), f"expected {expected_downloaded} but got {downloaded_count}"

    # Pair records positionally; zip stops at the shorter of the two lists.
    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
        configs.run_file_data_validation(
            predownload_file_data=pre_data, postdownload_file_data=post_data
        )
    configs.run_download_dir_validation(download_dir=download_dir)

    if configs.validate_file_data:
        merged_file_data = get_all_file_data(
            all_predownload_file_data=predownload_file_data,
            all_postdownload_file_data=postdownload_file_data,
        )
        run_expected_results_validation(
            expected_output_dir=test_output_dir / "file_data",
            all_file_data=merged_file_data,
            configs=configs,
        )

    download_files = sorted(get_files(dir_path=download_dir))
    run_directory_structure_validation(
        expected_output_dir=configs.test_output_dir(), download_files=download_files
    )

    if configs.validate_downloaded_files:
        run_expected_download_files_validation(
            expected_output_dir=test_output_dir / "downloads",
            current_download_dir=download_dir,
            configs=configs,
        )
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
def get_all_file_data(
    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
) -> list[FileData]:
    """Merge post-download file data with index-only records.

    The result starts with every post-download entry, followed by each
    pre-download entry whose ``identifier`` does not appear among them.

    A new list is returned; neither argument is modified. Before, the
    post-download list was extended in place, so the caller's list grew as a
    side effect.

    Args:
        all_postdownload_file_data: file data returned by the downloader.
        all_predownload_file_data: file data yielded by the indexer.

    Returns:
        The merged list, post-download entries first.
    """
    # Build the lookup once instead of rescanning the list for every record.
    downloaded_ids = {fd.identifier for fd in all_postdownload_file_data}
    index_only = [fd for fd in all_predownload_file_data if fd.identifier not in downloaded_ids]
    return [*all_postdownload_file_data, *index_only]
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
async def _download_file_data(downloader: Downloader, file_data: FileData) -> list[FileData]:
    """Download one indexed record and return deep copies of the resulting file data."""
    if downloader.is_async():
        resp = await downloader.run_async(file_data=file_data)
    else:
        resp = downloader.run(file_data=file_data)
    # The downloader returns either one response or a list of them.
    responses = resp if isinstance(resp, list) else [resp]
    return [r["file_data"].model_copy(deep=True) for r in responses]


async def source_connector_validation(
    indexer: Indexer,
    downloader: Downloader,
    configs: SourceValidationConfigs,
    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
) -> None:
    """Index and download with a source connector, then validate or refresh fixtures.

    Runs common validations on the process of running a source connector,
    supporting dynamic validators that get passed in along with comparisons on
    the saved expected values. If ``overwrite_fixtures`` is set to True, all
    validators are skipped and the expected values are overwritten with what
    this run generated.

    Args:
        indexer: indexer under test; its sync or async entry point is used
            depending on ``indexer.is_async()``.
        downloader: downloader under test, invoked once per indexed record.
        configs: validation settings for this test.
        overwrite_fixtures: refresh fixtures instead of validating. The
            default is read from the ``OVERWRITE_FIXTURES`` environment
            variable once, when this function is defined.
    """
    all_predownload_file_data = []
    all_postdownload_file_data = []
    indexer.precheck()
    download_dir = downloader.download_config.download_dir
    test_output_dir = configs.test_output_dir()

    async def _process(file_data: FileData) -> None:
        # Snapshot the record before the downloader gets a chance to modify it.
        assert file_data
        all_predownload_file_data.append(file_data.model_copy(deep=True))
        all_postdownload_file_data.extend(
            await _download_file_data(downloader=downloader, file_data=file_data)
        )

    if indexer.is_async():
        async for file_data in indexer.run_async():
            await _process(file_data)
    else:
        for file_data in indexer.run():
            await _process(file_data)

    if not overwrite_fixtures:
        print("Running validation")
        run_all_validations(
            configs=configs,
            predownload_file_data=all_predownload_file_data,
            postdownload_file_data=all_postdownload_file_data,
            download_dir=download_dir,
            test_output_dir=test_output_dir,
        )
    else:
        print("Running fixtures update")
        update_fixtures(
            output_dir=test_output_dir,
            download_dir=download_dir,
            all_file_data=get_all_file_data(
                all_predownload_file_data=all_predownload_file_data,
                all_postdownload_file_data=all_postdownload_file_data,
            ),
            save_downloads=configs.validate_downloaded_files,
            save_filedata=configs.validate_file_data,
        )
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import filecmp
|
|
2
|
-
import shutil
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Callable, Optional
|
|
5
|
-
|
|
6
|
-
from pydantic import BaseModel
|
|
7
|
-
|
|
8
|
-
from test.integration.connectors.utils.constants import expected_results_path
|
|
9
|
-
from test.integration.connectors.utils.validation.equality import file_type_equality_check
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class ValidationConfig(BaseModel):
    """Base settings shared by the connector validation helpers."""

    # Names the sub-directory of the expected-results root used by this test.
    test_id: str
    # Optional custom comparison; when set it replaces the built-in checks.
    file_equality_check: Optional[Callable[[Path, Path], bool]] = None

    def test_output_dir(self) -> Path:
        """Return the expected-results directory for ``test_id``."""
        return expected_results_path / self.test_id

    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
        """Return True when the two files differ.

        Files with different suffixes always differ. Otherwise the custom
        ``file_equality_check`` is used if set, then the checker registered
        for the suffix, and finally a byte-for-byte ``filecmp.cmp``.
        """
        suffix = expected_filepath.suffix
        if suffix != current_filepath.suffix:
            return True

        custom_check = self.file_equality_check
        if custom_check:
            return not custom_check(expected_filepath, current_filepath)

        suffix_check = file_type_equality_check.get(suffix)
        if suffix_check is not None:
            return not suffix_check(
                expected_filepath=expected_filepath, current_filepath=current_filepath
            )

        # No dedicated checker for this suffix: compare the raw file contents.
        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def reset_dir(dir_path: Path) -> None:
    """Delete ``dir_path`` with everything in it, then create it again empty.

    Errors during removal are ignored; missing parent directories are created.
    """
    shutil.rmtree(dir_path, ignore_errors=True)
    dir_path.mkdir(parents=True, exist_ok=False)
|
|
File without changes
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@pytest.fixture
def collections_schema_config() -> dict:
    """Load ``assets/elements.json`` from the directory that holds this file.

    Asserts that the path exists and is a regular file before parsing it.
    """
    config_file = Path(__file__).parent / "assets" / "elements.json"
    assert config_file.exists()
    assert config_file.is_file()
    with config_file.open() as config_data:
        loaded_config = json.load(config_data)
    return loaded_config
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
from pydantic import ValidationError
|
|
3
|
-
|
|
4
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
5
|
-
from unstructured_ingest.processes.connectors.weaviate.cloud import (
|
|
6
|
-
CONNECTOR_TYPE,
|
|
7
|
-
CloudWeaviateAccessConfig,
|
|
8
|
-
CloudWeaviateConnectionConfig,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
def test_weaviate_failing_connection_config():
    """A config given an API key, a password and a username must fail validation."""
    # NOTE(review): presumably rejected because several auth methods are set
    # at once - confirm against the connection config's validators.
    with pytest.raises(ValidationError):
        access_config = CloudWeaviateAccessConfig(api_key="my key", password="password")
        CloudWeaviateConnectionConfig(
            access_config=access_config,
            username="username",
            cluster_url="clusterurl",
        )
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
def test_weaviate_connection_config_happy_path():
    """A config with only an API key and a cluster URL is constructed without error."""
    access_config = CloudWeaviateAccessConfig(api_key="my key")
    CloudWeaviateConnectionConfig(access_config=access_config, cluster_url="clusterurl")
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
def test_weaviate_connection_config_anonymous():
    """With ``anonymous=True`` the config is constructed even though credentials are set."""
    access_config = CloudWeaviateAccessConfig(api_key="my key", password="password")
    CloudWeaviateConnectionConfig(
        access_config=access_config,
        username="username",
        anonymous=True,
        cluster_url="clusterurl",
    )
|