PyPI - unstructured-ingest - Versions diffs - 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl - Mend

unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187) hide show

unstructured_ingest/__version__.py +1 -1
unstructured_ingest/cli/README.md +28 -0
unstructured_ingest/embed/mixedbreadai.py +0 -1
unstructured_ingest/interfaces/upload_stager.py +2 -2
unstructured_ingest/interfaces/uploader.py +3 -3
unstructured_ingest/main.py +0 -0
unstructured_ingest/pipeline/interfaces.py +1 -1
unstructured_ingest/pipeline/pipeline.py +1 -1
unstructured_ingest/processes/chunker.py +4 -0
unstructured_ingest/processes/connectors/airtable.py +4 -2
unstructured_ingest/processes/connectors/astradb.py +48 -34
unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
unstructured_ingest/processes/connectors/confluence.py +0 -1
unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
unstructured_ingest/processes/connectors/delta_table.py +1 -0
unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
unstructured_ingest/processes/connectors/gitlab.py +1 -2
unstructured_ingest/processes/connectors/google_drive.py +0 -2
unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
unstructured_ingest/processes/connectors/kdbai.py +1 -0
unstructured_ingest/processes/connectors/outlook.py +1 -2
unstructured_ingest/processes/connectors/pinecone.py +0 -1
unstructured_ingest/processes/connectors/redisdb.py +28 -24
unstructured_ingest/processes/connectors/salesforce.py +1 -1
unstructured_ingest/processes/connectors/slack.py +1 -2
unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
unstructured_ingest/processes/connectors/sql/sql.py +3 -4
unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
unstructured_ingest/processes/connectors/vectara.py +0 -2
unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
unstructured_ingest/processes/embedder.py +2 -2
unstructured_ingest/processes/filter.py +1 -1
unstructured_ingest/processes/partitioner.py +4 -0
unstructured_ingest/processes/utils/blob_storage.py +2 -2
unstructured_ingest/unstructured_api.py +13 -8
unstructured_ingest/utils/data_prep.py +8 -32
unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
{unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
{unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
examples/__init__.py +0 -0
examples/airtable.py +0 -44
examples/azure_cognitive_search.py +0 -55
examples/chroma.py +0 -54
examples/couchbase.py +0 -55
examples/databricks_volumes_dest.py +0 -55
examples/databricks_volumes_source.py +0 -53
examples/delta_table.py +0 -45
examples/discord_example.py +0 -36
examples/elasticsearch.py +0 -49
examples/google_drive.py +0 -45
examples/kdbai.py +0 -54
examples/local.py +0 -36
examples/milvus.py +0 -44
examples/mongodb.py +0 -53
examples/opensearch.py +0 -50
examples/pinecone.py +0 -57
examples/s3.py +0 -38
examples/salesforce.py +0 -44
examples/sharepoint.py +0 -47
examples/singlestore.py +0 -49
examples/sql.py +0 -90
examples/vectara.py +0 -54
examples/weaviate.py +0 -44
test/__init__.py +0 -0
test/integration/__init__.py +0 -0
test/integration/chunkers/__init__.py +0 -0
test/integration/chunkers/test_chunkers.py +0 -31
test/integration/connectors/__init__.py +0 -0
test/integration/connectors/conftest.py +0 -38
test/integration/connectors/databricks/__init__.py +0 -0
test/integration/connectors/databricks/test_volumes_native.py +0 -273
test/integration/connectors/discord/__init__.py +0 -0
test/integration/connectors/discord/test_discord.py +0 -90
test/integration/connectors/duckdb/__init__.py +0 -0
test/integration/connectors/duckdb/conftest.py +0 -14
test/integration/connectors/duckdb/test_duckdb.py +0 -90
test/integration/connectors/duckdb/test_motherduck.py +0 -95
test/integration/connectors/elasticsearch/__init__.py +0 -0
test/integration/connectors/elasticsearch/conftest.py +0 -34
test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
test/integration/connectors/sql/__init__.py +0 -0
test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
test/integration/connectors/sql/test_postgres.py +0 -201
test/integration/connectors/sql/test_singlestore.py +0 -182
test/integration/connectors/sql/test_snowflake.py +0 -244
test/integration/connectors/sql/test_sqlite.py +0 -168
test/integration/connectors/sql/test_vastdb.py +0 -34
test/integration/connectors/test_astradb.py +0 -287
test/integration/connectors/test_azure_ai_search.py +0 -254
test/integration/connectors/test_chroma.py +0 -136
test/integration/connectors/test_confluence.py +0 -111
test/integration/connectors/test_delta_table.py +0 -183
test/integration/connectors/test_dropbox.py +0 -151
test/integration/connectors/test_github.py +0 -49
test/integration/connectors/test_google_drive.py +0 -257
test/integration/connectors/test_jira.py +0 -67
test/integration/connectors/test_lancedb.py +0 -247
test/integration/connectors/test_milvus.py +0 -208
test/integration/connectors/test_mongodb.py +0 -335
test/integration/connectors/test_neo4j.py +0 -244
test/integration/connectors/test_notion.py +0 -152
test/integration/connectors/test_onedrive.py +0 -163
test/integration/connectors/test_pinecone.py +0 -387
test/integration/connectors/test_qdrant.py +0 -216
test/integration/connectors/test_redis.py +0 -143
test/integration/connectors/test_s3.py +0 -184
test/integration/connectors/test_sharepoint.py +0 -222
test/integration/connectors/test_vectara.py +0 -282
test/integration/connectors/test_zendesk.py +0 -120
test/integration/connectors/utils/__init__.py +0 -0
test/integration/connectors/utils/constants.py +0 -13
test/integration/connectors/utils/docker.py +0 -151
test/integration/connectors/utils/docker_compose.py +0 -59
test/integration/connectors/utils/validation/__init__.py +0 -0
test/integration/connectors/utils/validation/destination.py +0 -77
test/integration/connectors/utils/validation/equality.py +0 -76
test/integration/connectors/utils/validation/source.py +0 -331
test/integration/connectors/utils/validation/utils.py +0 -36
test/integration/connectors/weaviate/__init__.py +0 -0
test/integration/connectors/weaviate/conftest.py +0 -15
test/integration/connectors/weaviate/test_cloud.py +0 -39
test/integration/connectors/weaviate/test_local.py +0 -152
test/integration/embedders/__init__.py +0 -0
test/integration/embedders/conftest.py +0 -13
test/integration/embedders/test_azure_openai.py +0 -57
test/integration/embedders/test_bedrock.py +0 -103
test/integration/embedders/test_huggingface.py +0 -24
test/integration/embedders/test_mixedbread.py +0 -71
test/integration/embedders/test_octoai.py +0 -75
test/integration/embedders/test_openai.py +0 -74
test/integration/embedders/test_togetherai.py +0 -71
test/integration/embedders/test_vertexai.py +0 -63
test/integration/embedders/test_voyageai.py +0 -79
test/integration/embedders/utils.py +0 -66
test/integration/partitioners/__init__.py +0 -0
test/integration/partitioners/test_partitioner.py +0 -76
test/integration/utils.py +0 -15
test/unit/__init__.py +0 -0
test/unit/chunkers/__init__.py +0 -0
test/unit/chunkers/test_chunkers.py +0 -49
test/unit/connectors/__init__.py +0 -0
test/unit/connectors/ibm_watsonx/__init__.py +0 -0
test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
test/unit/connectors/motherduck/__init__.py +0 -0
test/unit/connectors/motherduck/test_base.py +0 -73
test/unit/connectors/sql/__init__.py +0 -0
test/unit/connectors/sql/test_sql.py +0 -152
test/unit/connectors/test_confluence.py +0 -71
test/unit/connectors/test_jira.py +0 -401
test/unit/embed/__init__.py +0 -0
test/unit/embed/test_mixedbreadai.py +0 -42
test/unit/embed/test_octoai.py +0 -27
test/unit/embed/test_openai.py +0 -28
test/unit/embed/test_vertexai.py +0 -25
test/unit/embed/test_voyageai.py +0 -24
test/unit/embedders/__init__.py +0 -0
test/unit/embedders/test_bedrock.py +0 -36
test/unit/embedders/test_huggingface.py +0 -48
test/unit/embedders/test_mixedbread.py +0 -37
test/unit/embedders/test_octoai.py +0 -35
test/unit/embedders/test_openai.py +0 -35
test/unit/embedders/test_togetherai.py +0 -37
test/unit/embedders/test_vertexai.py +0 -37
test/unit/embedders/test_voyageai.py +0 -38
test/unit/partitioners/__init__.py +0 -0
test/unit/partitioners/test_partitioner.py +0 -63
test/unit/test_error.py +0 -27
test/unit/test_html.py +0 -112
test/unit/test_interfaces.py +0 -26
test/unit/test_utils.py +0 -220
test/unit/utils/__init__.py +0 -0
test/unit/utils/data_generator.py +0 -32
unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
{unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
{unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0

test/integration/connectors/utils/validation/destination.py DELETED Viewed

@@ -1,77 +0,0 @@
-import os
-import shutil
-from pathlib import Path
-from test.integration.connectors.utils.validation.utils import ValidationConfig
-from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
-from unstructured_ingest.interfaces import UploadStager
-from unstructured_ingest.utils.data_prep import get_data
-class StagerValidationConfigs(ValidationConfig):
-    expected_count: int
-    expected_folder: str = "stager"
-    def stager_output_dir(self) -> Path:
-        dir = self.test_output_dir() / self.expected_folder
-        dir.mkdir(exist_ok=True, parents=True)
-        return dir
-    def stager_output_path(self, input_path: Path) -> Path:
-        return self.stager_output_dir() / input_path.name
-def run_all_stager_validations(
-    configs: StagerValidationConfigs, input_file: Path, staged_filepath: Path
-):
-    # Validate matching extensions
-    assert input_file.suffix == staged_filepath.suffix
-    # Validate length
-    staged_data = get_data(path=staged_filepath)
-    assert len(staged_data) == configs.expected_count
-    # Validate file
-    expected_filepath = configs.stager_output_path(input_path=input_file)
-    assert expected_filepath.exists(), f"{expected_filepath} does not exist"
-    assert expected_filepath.is_file(), f"{expected_filepath} is not a file"
-    if configs.detect_diff(expected_filepath=expected_filepath, current_filepath=staged_filepath):
-        raise AssertionError(
-            f"Current file ({staged_filepath}) does not match expected file: {expected_filepath}"
-        )
-def update_stager_fixtures(stager_output_path: Path, staged_filepath: Path):
-    copied_filepath = stager_output_path / staged_filepath.name
-    shutil.copy(staged_filepath, copied_filepath)
-def stager_validation(
-    stager: UploadStager,
-    tmp_dir: Path,
-    input_file: Path,
-    configs: StagerValidationConfigs,
-    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
-) -> None:
-    # Run stager
-    file_data = FileData(
-        source_identifiers=SourceIdentifiers(fullpath=input_file.name, filename=input_file.name),
-        connector_type=configs.test_id,
-        identifier="mock file data",
-    )
-    staged_filepath = stager.run(
-        elements_filepath=input_file,
-        file_data=file_data,
-        output_dir=tmp_dir,
-        output_filename=input_file.name,
-    )
-    if not overwrite_fixtures:
-        print("Running validation")
-        run_all_stager_validations(
-            configs=configs, input_file=input_file, staged_filepath=staged_filepath
-        )
-    else:
-        print("Running fixtures update")
-        update_stager_fixtures(
-            stager_output_path=configs.stager_output_dir(), staged_filepath=staged_filepath
-        )

test/integration/connectors/utils/validation/equality.py DELETED Viewed

@@ -1,76 +0,0 @@
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-from deepdiff import DeepDiff
-from unstructured_ingest.utils import ndjson
-def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as f:
-        expected_data = json.load(f)
-    with current_filepath.open() as f:
-        current_data = json.load(f)
-    diff = DeepDiff(expected_data, current_data)
-    if diff:
-        print("diff between expected and current json")
-        print(diff.to_json(indent=2))
-        return False
-    return True
-def ndjson_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as f:
-        expected_data = ndjson.load(f)
-    with current_filepath.open() as f:
-        current_data = ndjson.load(f)
-    if len(current_data) != len(expected_data):
-        print(
-            f"expected data length {len(expected_data)} "
-            f"didn't match current results: {len(current_data)}"
-        )
-    for i in range(len(expected_data)):
-        e = expected_data[i]
-        r = current_data[i]
-        if e != r:
-            print(f"{i}th element doesn't match:\nexpected {e}\ncurrent {r}")
-            return False
-    return True
-def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_soup = BeautifulSoup(expected_f, "html.parser")
-    with current_filepath.open() as current_f:
-        current_soup = BeautifulSoup(current_f, "html.parser")
-    return expected_soup.text == current_soup.text
-def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_text_lines = expected_f.readlines()
-    with current_filepath.open() as current_f:
-        current_text_lines = current_f.readlines()
-    if len(expected_text_lines) != len(current_text_lines):
-        print(
-            f"Lines in expected text file ({len(expected_text_lines)}) "
-            f"don't match current text file ({len(current_text_lines)})"
-        )
-        return False
-    expected_text = "\n".join(expected_text_lines)
-    current_text = "\n".join(current_text_lines)
-    if expected_text == current_text:
-        return True
-    print("txt content don't match:")
-    print(f"expected: {expected_text}")
-    print(f"current: {current_text}")
-    return False
-# Maps a file suffix to the function that compares two files of that type.
-# Each function takes (expected_filepath, current_filepath) and returns True on a match.
-file_type_equality_check = {
-    ".json": json_equality_check,
-    ".ndjson": ndjson_equality_check,
-    ".html": html_equality_check,
-    ".txt": txt_equality_check,
-}

test/integration/connectors/utils/validation/source.py DELETED Viewed

@@ -1,331 +0,0 @@
-import json
-import os
-import shutil
-from pathlib import Path
-from typing import Callable, Optional
-from deepdiff import DeepDiff
-from pydantic import Field
-from test.integration.connectors.utils.validation.utils import ValidationConfig
-from unstructured_ingest.data_types.file_data import FileData
-from unstructured_ingest.interfaces import Downloader, Indexer
-# Exclude-field names whose key path cannot be obtained by splitting on ".", because
-# one of the keys contains dots itself. Maps the dotted name to its explicit key path.
-NONSTANDARD_METADATA_FIELDS = {
-    "additional_metadata.@microsoft.graph.downloadUrl": [
-        "additional_metadata",
-        "@microsoft.graph.downloadUrl",
-    ]
-}
-class SourceValidationConfigs(ValidationConfig):
-    expected_number_indexed_file_data: Optional[int] = None
-    expected_num_files: Optional[int] = None
-    predownload_file_data_check: Optional[Callable[[FileData], None]] = None
-    postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-    exclude_fields: list[str] = Field(
-        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
-    )
-    exclude_fields_extend: list[str] = Field(default_factory=list)
-    validate_downloaded_files: bool = False
-    validate_file_data: bool = True
-    def get_exclude_fields(self) -> list[str]:
-        exclude_fields = self.exclude_fields
-        exclude_fields.extend(self.exclude_fields_extend)
-        return list(set(exclude_fields))
-    def run_file_data_validation(
-        self, predownload_file_data: FileData, postdownload_file_data: FileData
-    ):
-        if predownload_file_data_check := self.predownload_file_data_check:
-            predownload_file_data_check(predownload_file_data)
-        if postdownload_file_data_check := self.postdownload_file_data_check:
-            postdownload_file_data_check(postdownload_file_data)
-    def run_download_dir_validation(self, download_dir: Path):
-        if expected_num_files := self.expected_num_files:
-            downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
-            assert len(downloaded_files) == expected_num_files
-    def omit_ignored_fields(self, data: dict) -> dict:
-        exclude_fields = self.get_exclude_fields()
-        # Ignore fields that dynamically change every time the tests run
-        copied_data = data.copy()
-        for exclude_field in exclude_fields:
-            exclude_field_vals = (
-                NONSTANDARD_METADATA_FIELDS[exclude_field]
-                if exclude_field in NONSTANDARD_METADATA_FIELDS
-                else exclude_field.split(".")
-            )
-            if len(exclude_field_vals) == 1:
-                current_val = copied_data
-                drop_field = exclude_field_vals[0]
-                copied_data.pop(exclude_field_vals[0], None)
-            else:
-                current_val = copied_data
-                for val in exclude_field_vals[:-1]:
-                    current_val = current_val.get(val, {})
-                drop_field = exclude_field_vals[-1]
-            if drop_field == "*":
-                current_val.clear()
-            else:
-                current_val.pop(drop_field, None)
-        return copied_data
-def get_files(dir_path: Path) -> list[str]:
-    return [
-        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file()
-    ]
-def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
-    expected_files = get_files(dir_path=expected_output_dir)
-    current_files = [f"{file_data.identifier}.json" for file_data in all_file_data]
-    diff = set(expected_files) ^ set(current_files)
-    assert not diff, "diff in files that exist: {}".format(", ".join(diff))
-def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
-    expected_files = get_files(dir_path=expected_output_dir)
-    current_files = get_files(dir_path=current_output_dir)
-    diff = set(expected_files) ^ set(current_files)
-    assert not diff, "diff in files that exist: {}".format(", ".join(diff))
-def check_contents(
-    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
-):
-    found_diff = False
-    for file_data in all_file_data:
-        file_data_path = expected_output_dir / f"{file_data.identifier}.json"
-        with file_data_path.open("r") as file:
-            expected_file_data_contents = json.load(file)
-        current_file_data_contents = json.loads(file_data.model_dump_json())
-        expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
-        current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
-        diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
-        if diff:
-            found_diff = True
-            print(diff.to_json(indent=2))
-    assert not found_diff, f"Diffs found between files: {found_diff}"
-def check_raw_file_contents(
-    expected_output_dir: Path,
-    current_output_dir: Path,
-    configs: SourceValidationConfigs,
-):
-    current_files = get_files(dir_path=current_output_dir)
-    found_diff = False
-    files = []
-    for current_file in current_files:
-        current_file_path = current_output_dir / current_file
-        expected_file_path = expected_output_dir / current_file
-        if configs.detect_diff(expected_file_path, current_file_path):
-            found_diff = True
-            files.append(str(expected_file_path))
-            print(f"diffs between files {expected_file_path} and {current_file_path}")
-    assert not found_diff, "Diffs found between files: {}".format(", ".join(files))
-def run_expected_results_validation(
-    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
-):
-    check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
-    check_contents(
-        expected_output_dir=expected_output_dir, all_file_data=all_file_data, configs=configs
-    )
-def run_expected_download_files_validation(
-    expected_output_dir: Path,
-    current_download_dir: Path,
-    configs: SourceValidationConfigs,
-):
-    check_files_in_paths(
-        expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
-    )
-    check_raw_file_contents(
-        expected_output_dir=expected_output_dir,
-        current_output_dir=current_download_dir,
-        configs=configs,
-    )
-def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
-    directory_record = expected_output_dir / "directory_structure.json"
-    with directory_record.open("r") as directory_file:
-        directory_file_contents = json.load(directory_file)
-    directory_structure = directory_file_contents["directory_structure"]
-    assert directory_structure == download_files
-def update_fixtures(
-    output_dir: Path,
-    download_dir: Path,
-    all_file_data: list[FileData],
-    save_downloads: bool = False,
-    save_filedata: bool = True,
-):
-    # Rewrite the current file data
-    if not output_dir.exists():
-        output_dir.mkdir(parents=True)
-    if save_filedata:
-        file_data_output_path = output_dir / "file_data"
-        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
-        print(
-            f"Writing {len(all_file_data)} file data to "
-            f"saved fixture location {file_data_output_path}"
-        )
-        file_data_output_path.mkdir(parents=True, exist_ok=True)
-        for file_data in all_file_data:
-            file_data_path = file_data_output_path / f"{file_data.identifier}.json"
-            with file_data_path.open(mode="w") as f:
-                f.write(file_data.model_dump_json(indent=2))
-    # Record file structure of download directory
-    download_files = get_files(dir_path=download_dir)
-    download_files.sort()
-    download_dir_record = output_dir / "directory_structure.json"
-    with download_dir_record.open(mode="w") as f:
-        json.dump({"directory_structure": download_files}, f, indent=2)
-    # If applicable, save raw downloads
-    if save_downloads:
-        raw_download_output_path = output_dir / "downloads"
-        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
-        print(
-            f"Writing {len(download_files)} downloaded files to "
-            f"saved fixture location {raw_download_output_path}"
-        )
-        shutil.copytree(download_dir, raw_download_output_path)
-def run_all_validations(
-    configs: SourceValidationConfigs,
-    predownload_file_data: list[FileData],
-    postdownload_file_data: list[FileData],
-    download_dir: Path,
-    test_output_dir: Path,
-):
-    if expected_number_indexed_file_data := configs.expected_number_indexed_file_data:
-        assert (
-            len(predownload_file_data) == expected_number_indexed_file_data
-        ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
-    if expected_num_files := configs.expected_num_files:
-        assert (
-            len(postdownload_file_data) == expected_num_files
-        ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"
-    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
-        configs.run_file_data_validation(
-            predownload_file_data=pre_data, postdownload_file_data=post_data
-        )
-    configs.run_download_dir_validation(download_dir=download_dir)
-    if configs.validate_file_data:
-        run_expected_results_validation(
-            expected_output_dir=test_output_dir / "file_data",
-            all_file_data=get_all_file_data(
-                all_predownload_file_data=predownload_file_data,
-                all_postdownload_file_data=postdownload_file_data,
-            ),
-            configs=configs,
-        )
-    download_files = get_files(dir_path=download_dir)
-    download_files.sort()
-    run_directory_structure_validation(
-        expected_output_dir=configs.test_output_dir(), download_files=download_files
-    )
-    if configs.validate_downloaded_files:
-        run_expected_download_files_validation(
-            expected_output_dir=test_output_dir / "downloads",
-            current_download_dir=download_dir,
-            configs=configs,
-        )
-def get_all_file_data(
-    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
-) -> list[FileData]:
-    all_file_data = all_postdownload_file_data
-    indexed_file_data = [
-        fd
-        for fd in all_predownload_file_data
-        if fd.identifier not in [f.identifier for f in all_file_data]
-    ]
-    all_file_data += indexed_file_data
-    return all_file_data
-async def source_connector_validation(
-    indexer: Indexer,
-    downloader: Downloader,
-    configs: SourceValidationConfigs,
-    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
-) -> None:
-    # Run common validations on the process of running a source connector, supporting dynamic
-    # validators that get passed in along with comparisons on the saved expected values.
-    # If overwrite_fixtures is st to True, will ignore all validators but instead overwrite the
-    # expected values with what gets generated by this test.
-    all_predownload_file_data = []
-    all_postdownload_file_data = []
-    indexer.precheck()
-    download_dir = downloader.download_config.download_dir
-    test_output_dir = configs.test_output_dir()
-    if indexer.is_async():
-        async for file_data in indexer.run_async():
-            assert file_data
-            predownload_file_data = file_data.model_copy(deep=True)
-            all_predownload_file_data.append(predownload_file_data)
-            if downloader.is_async():
-                resp = await downloader.run_async(file_data=file_data)
-            else:
-                resp = downloader.run(file_data=file_data)
-            if isinstance(resp, list):
-                for r in resp:
-                    postdownload_file_data = r["file_data"].model_copy(deep=True)
-                    all_postdownload_file_data.append(postdownload_file_data)
-            else:
-                postdownload_file_data = resp["file_data"].model_copy(deep=True)
-                all_postdownload_file_data.append(postdownload_file_data)
-    else:
-        for file_data in indexer.run():
-            assert file_data
-            predownload_file_data = file_data.model_copy(deep=True)
-            all_predownload_file_data.append(predownload_file_data)
-            if downloader.is_async():
-                resp = await downloader.run_async(file_data=file_data)
-            else:
-                resp = downloader.run(file_data=file_data)
-            if isinstance(resp, list):
-                for r in resp:
-                    postdownload_file_data = r["file_data"].model_copy(deep=True)
-                    all_postdownload_file_data.append(postdownload_file_data)
-            else:
-                postdownload_file_data = resp["file_data"].model_copy(deep=True)
-                all_postdownload_file_data.append(postdownload_file_data)
-    if not overwrite_fixtures:
-        print("Running validation")
-        run_all_validations(
-            configs=configs,
-            predownload_file_data=all_predownload_file_data,
-            postdownload_file_data=all_postdownload_file_data,
-            download_dir=download_dir,
-            test_output_dir=test_output_dir,
-        )
-    else:
-        print("Running fixtures update")
-        update_fixtures(
-            output_dir=test_output_dir,
-            download_dir=download_dir,
-            all_file_data=get_all_file_data(
-                all_predownload_file_data=all_predownload_file_data,
-                all_postdownload_file_data=all_postdownload_file_data,
-            ),
-            save_downloads=configs.validate_downloaded_files,
-            save_filedata=configs.validate_file_data,
-        )

test/integration/connectors/utils/validation/utils.py DELETED Viewed

@@ -1,36 +0,0 @@
-import filecmp
-import shutil
-from pathlib import Path
-from typing import Callable, Optional
-from pydantic import BaseModel
-from test.integration.connectors.utils.constants import expected_results_path
-from test.integration.connectors.utils.validation.equality import file_type_equality_check
-class ValidationConfig(BaseModel):
-    test_id: str
-    file_equality_check: Optional[Callable[[Path, Path], bool]] = None
-    def test_output_dir(self) -> Path:
-        return expected_results_path / self.test_id
-    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
-        if expected_filepath.suffix != current_filepath.suffix:
-            return True
-        if file_equality_check := self.file_equality_check:
-            return not file_equality_check(expected_filepath, current_filepath)
-        current_suffix = expected_filepath.suffix
-        if current_suffix in file_type_equality_check:
-            equality_check_callable = file_type_equality_check[current_suffix]
-            return not equality_check_callable(
-                expected_filepath=expected_filepath, current_filepath=current_filepath
-            )
-        # Fallback is using filecmp.cmp to compare the files
-        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-def reset_dir(dir_path: Path) -> None:
-    shutil.rmtree(path=dir_path, ignore_errors=True)
-    dir_path.mkdir(parents=True)

test/integration/connectors/weaviate/__init__.py DELETED Viewed

File without changes

test/integration/connectors/weaviate/conftest.py DELETED Viewed

@@ -1,15 +0,0 @@
-import json
-from pathlib import Path
-import pytest
-@pytest.fixture
-def collections_schema_config() -> dict:
-    int_test_dir = Path(__file__).parent
-    assets_dir = int_test_dir / "assets"
-    config_file = assets_dir / "elements.json"
-    assert config_file.exists()
-    assert config_file.is_file()
-    with config_file.open() as config_data:
-        return json.load(config_data)

test/integration/connectors/weaviate/test_cloud.py DELETED Viewed

@@ -1,39 +0,0 @@
-import pytest
-from pydantic import ValidationError
-from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
-from unstructured_ingest.processes.connectors.weaviate.cloud import (
-    CONNECTOR_TYPE,
-    CloudWeaviateAccessConfig,
-    CloudWeaviateConnectionConfig,
-)
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-def test_weaviate_failing_connection_config():
-    with pytest.raises(ValidationError):
-        CloudWeaviateConnectionConfig(
-            access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
-            username="username",
-            cluster_url="clusterurl",
-        )
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-def test_weaviate_connection_config_happy_path():
-    CloudWeaviateConnectionConfig(
-        access_config=CloudWeaviateAccessConfig(
-            api_key="my key",
-        ),
-        cluster_url="clusterurl",
-    )
-@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
-def test_weaviate_connection_config_anonymous():
-    CloudWeaviateConnectionConfig(
-        access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
-        username="username",
-        anonymous=True,
-        cluster_url="clusterurl",
-    )

unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl