unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +178 -0
- test/integration/connectors/sql/test_sqlite.py +151 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +203 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
test/integration/connectors/test_s3.py
@@ -0,0 +1,152 @@
+import os
+import tempfile
+import uuid
+from pathlib import Path
+
+import pytest
+
+from test.integration.connectors.utils.constants import (
+    DESTINATION_TAG,
+    SOURCE_TAG,
+    env_setup_path,
+)
+from test.integration.connectors.utils.docker_compose import docker_compose_context
+from test.integration.connectors.utils.validation import (
+    ValidationConfigs,
+    source_connector_validation,
+)
+from test.integration.utils import requires_env
+from unstructured_ingest.error import (
+    SourceConnectionError,
+)
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
+    CONNECTOR_TYPE,
+    S3AccessConfig,
+    S3ConnectionConfig,
+    S3Downloader,
+    S3DownloaderConfig,
+    S3Indexer,
+    S3IndexerConfig,
+    S3Uploader,
+    S3UploaderConfig,
+)
+
+
+def validate_predownload_file_data(file_data: FileData):
+    assert file_data.connector_type == CONNECTOR_TYPE
+    assert file_data.local_download_path is None
+
+
+def validate_postdownload_file_data(file_data: FileData):
+    assert file_data.connector_type == CONNECTOR_TYPE
+    assert file_data.local_download_path is not None
+
+
+@pytest.fixture
+def anon_connection_config() -> S3ConnectionConfig:
+    return S3ConnectionConfig(access_config=S3AccessConfig(), anonymous=True)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+async def test_s3_source(anon_connection_config: S3ConnectionConfig):
+    indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/")
+    with tempfile.TemporaryDirectory() as tempdir:
+        tempdir_path = Path(tempdir)
+        download_config = S3DownloaderConfig(download_dir=tempdir_path)
+        indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
+        downloader = S3Downloader(
+            connection_config=anon_connection_config, download_config=download_config
+        )
+        await source_connector_validation(
+            indexer=indexer,
+            downloader=downloader,
+            configs=ValidationConfigs(
+                test_id="s3",
+                predownload_file_data_check=validate_predownload_file_data,
+                postdownload_file_data_check=validate_postdownload_file_data,
+                expected_num_files=4,
+            ),
+        )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+async def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
+    indexer_config = S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/")
+    indexer = S3Indexer(connection_config=anon_connection_config, index_config=indexer_config)
+    with pytest.raises(SourceConnectionError):
+        indexer.precheck()
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio")
+async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
+    anon_connection_config.endpoint_url = "http://localhost:9000"
+    indexer_config = S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/")
+    with docker_compose_context(docker_compose_path=env_setup_path / "minio"):
+        with tempfile.TemporaryDirectory() as tempdir:
+            tempdir_path = Path(tempdir)
+            download_config = S3DownloaderConfig(download_dir=tempdir_path)
+            indexer = S3Indexer(
+                connection_config=anon_connection_config, index_config=indexer_config
+            )
+            downloader = S3Downloader(
+                connection_config=anon_connection_config, download_config=download_config
+            )
+            await source_connector_validation(
+                indexer=indexer,
+                downloader=downloader,
+                configs=ValidationConfigs(
+                    test_id="s3-minio",
+                    predownload_file_data_check=validate_predownload_file_data,
+                    postdownload_file_data_check=validate_postdownload_file_data,
+                    expected_num_files=1,
+                    exclude_fields_extend=[
+                        "metadata.date_modified",
+                        "metadata.date_created",
+                        "additional_metadata.LastModified",
+                    ],
+                ),
+            )
+
+
+def get_aws_credentials() -> dict:
+    access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY", None)
+    assert access_key
+    secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY", None)
+    assert secret_key
+    return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
+async def test_s3_destination(upload_file: Path):
+    aws_credentials = get_aws_credentials()
+    s3_bucket = "s3://utic-ingest-test-fixtures"
+    destination_path = f"{s3_bucket}/destination/{uuid.uuid4()}"
+    connection_config = S3ConnectionConfig(
+        access_config=S3AccessConfig(
+            key=aws_credentials["aws_access_key_id"],
+            secret=aws_credentials["aws_secret_access_key"],
+        ),
+    )
+    upload_config = S3UploaderConfig(remote_url=destination_path)
+    uploader = S3Uploader(connection_config=connection_config, upload_config=upload_config)
+    s3fs = uploader.fs
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    try:
+        if uploader.is_async():
+            await uploader.run_async(path=upload_file, file_data=file_data)
+        else:
+            uploader.run(path=upload_file, file_data=file_data)
+        uploaded_files = s3fs.ls(path=destination_path)
+        assert len(uploaded_files) == 1
+    finally:
+        s3fs.rm(path=destination_path, recursive=True)
File without changes
test/integration/connectors/utils/docker_compose.py
@@ -0,0 +1,44 @@
+import subprocess
+from contextlib import contextmanager
+from pathlib import Path
+
+
+@contextmanager
+def docker_compose_context(docker_compose_path: Path):
+    # Dynamically run a specific docker compose file and make sure it gets cleaned up
+    # by leveraging a context manager. Uses subprocess to map docker compose commands
+    # to the underlying shell.
+    assert docker_compose_path.exists()
+    if docker_compose_path.is_dir():
+        if (docker_compose_path / "docker-compose.yml").exists():
+            docker_compose_path = docker_compose_path / "docker-compose.yml"
+        elif (docker_compose_path / "docker-compose.yaml").exists():
+            docker_compose_path = docker_compose_path / "docker-compose.yaml"
+    assert docker_compose_path.is_file()
+    resp = None
+    try:
+        cmd = f"docker compose -f {docker_compose_path.resolve()} up -d --wait"
+        print(f"Running command: {cmd}")
+        resp = subprocess.run(
+            cmd,
+            shell=True,
+            capture_output=True,
+        )
+        # Return code from docker compose using --wait can be 1 even if no error
+        yield
+    except Exception as e:
+        if resp:
+            print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
+            print("STDERR: {}".format(resp.stderr.decode("utf-8")))
+        raise e
+    finally:
+        cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v"
+        print(f"Running command: {cmd}")
+        final_resp = subprocess.run(
+            cmd,
+            shell=True,
+            capture_output=True,
+        )
+        if final_resp.returncode != 0:
+            print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
+            print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
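This context manager is what lets test_s3_minio_source spin up MinIO on demand. A sketch of the usage pattern, assuming env_setup_path (from test/integration/connectors/utils/constants.py, +7 lines, contents not shown here) points at the directory holding the docker-compose environments:

from test.integration.connectors.utils.constants import env_setup_path
from test.integration.connectors.utils.docker_compose import docker_compose_context

with docker_compose_context(docker_compose_path=env_setup_path / "minio"):
    # Containers from the minio compose file are up here (docker compose up -d --wait);
    # on exit, even after a test failure, they are torn down via down --remove-orphans -v.
    print("run assertions against http://localhost:9000 here")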
test/integration/connectors/utils/validation.py
@@ -0,0 +1,203 @@
+import json
+import os
+import shutil
+from dataclasses import dataclass, field, replace
+from pathlib import Path
+from typing import Callable, Optional
+
+from deepdiff import DeepDiff
+
+from test.integration.connectors.utils.constants import expected_results_path
+from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
+
+
+@dataclass
+class ValidationConfigs:
+    test_id: str
+    expected_num_files: Optional[int] = None
+    predownload_file_data_check: Optional[Callable[[FileData], None]] = None
+    postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
+    exclude_fields: list[str] = field(
+        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
+    )
+    exclude_fields_extend: list[str] = field(default_factory=list)
+
+    def get_exclude_fields(self) -> list[str]:
+        exclude_fields = self.exclude_fields
+        exclude_fields.extend(self.exclude_fields_extend)
+        return exclude_fields
+
+    def run_file_data_validation(
+        self, predownload_file_data: FileData, postdownload_file_data: FileData
+    ):
+        if predownload_file_data_check := self.predownload_file_data_check:
+            predownload_file_data_check(predownload_file_data)
+        if postdownload_file_data_check := self.postdownload_file_data_check:
+            postdownload_file_data_check(postdownload_file_data)
+
+    def run_download_dir_validation(self, download_dir: Path):
+        if expected_num_files := self.expected_num_files:
+            downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
+            assert len(downloaded_files) == expected_num_files
+
+    def test_output_dir(self) -> Path:
+        return expected_results_path / self.test_id
+
+    def omit_ignored_fields(self, data: dict) -> dict:
+        exclude_fields = self.get_exclude_fields()
+        # Ignore fields that dynamically change every time the tests run
+        copied_data = data.copy()
+        for exclude_field in exclude_fields:
+            exclude_field_vals = exclude_field.split(".")
+            if len(exclude_field_vals) == 1:
+                current_val = copied_data
+                drop_field = exclude_field_vals[0]
+                copied_data.pop(exclude_field_vals[0], None)
+            else:
+                current_val = copied_data
+                for val in exclude_field_vals[:-1]:
+                    current_val = current_val.get(val, {})
+                drop_field = exclude_field_vals[-1]
+                if drop_field == "*":
+                    current_val.clear()
+                else:
+                    current_val.pop(drop_field, None)
+        return copied_data
+
+
+def get_files(dir_path: Path) -> list[str]:
+    return [
+        str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.iterdir() if f.is_file()
+    ]
+
+
+def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
+    expected_files = get_files(dir_path=expected_output_dir)
+    current_files = [f"{file_data.identifier}.json" for file_data in all_file_data]
+    diff = set(expected_files) ^ set(current_files)
+    assert not diff, "diff in files that exist: {}".format(", ".join(diff))
+
+
+def check_contents(
+    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+):
+    found_diff = False
+    for file_data in all_file_data:
+        file_data_path = expected_output_dir / f"{file_data.identifier}.json"
+        with file_data_path.open("r") as file:
+            expected_file_data_contents = json.load(file)
+        current_file_data_contents = file_data.to_dict()
+        expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
+        current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
+        diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
+        if diff:
+            found_diff = True
+            print(diff.to_json(indent=2))
+    assert not found_diff, f"Diffs found between files: {found_diff}"
+
+
+def run_expected_results_validation(
+    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
+):
+    check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
+    check_contents(
+        expected_output_dir=expected_output_dir, all_file_data=all_file_data, configs=configs
+    )
+
+
+def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
+    directory_record = expected_output_dir / "directory_structure.json"
+    with directory_record.open("r") as directory_file:
+        directory_file_contents = json.load(directory_file)
+    directory_structure = directory_file_contents["directory_structure"]
+    assert directory_structure == download_files
+
+
+def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[FileData]):
+    # Delete current files
+    shutil.rmtree(path=output_dir, ignore_errors=True)
+    output_dir.mkdir(parents=True)
+    # Rewrite the current file data
+    file_data_output_path = output_dir / "file_data"
+    file_data_output_path.mkdir(parents=True)
+    for file_data in all_file_data:
+        file_data_path = file_data_output_path / f"{file_data.identifier}.json"
+        with file_data_path.open(mode="w") as f:
+            json.dump(file_data.to_dict(), f, indent=2)
+
+    # Record file structure of download directory
+    download_files = get_files(dir_path=download_dir)
+    download_files.sort()
+    download_dir_record = output_dir / "directory_structure.json"
+    with download_dir_record.open(mode="w") as f:
+        json.dump({"directory_structure": download_files}, f, indent=2)
+
+
+def run_all_validations(
+    configs: ValidationConfigs,
+    predownload_file_data: list[FileData],
+    postdownload_file_data: list[FileData],
+    download_dir: Path,
+    test_output_dir: Path,
+):
+    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
+        configs.run_file_data_validation(
+            predownload_file_data=pre_data, postdownload_file_data=post_data
+        )
+    configs.run_download_dir_validation(download_dir=download_dir)
+    run_expected_results_validation(
+        expected_output_dir=test_output_dir / "file_data",
+        all_file_data=postdownload_file_data,
+        configs=configs,
+    )
+    download_files = get_files(dir_path=download_dir)
+    download_files.sort()
+    run_directory_structure_validation(
+        expected_output_dir=configs.test_output_dir(), download_files=download_files
+    )
+
+
+async def source_connector_validation(
+    indexer: Indexer,
+    downloader: Downloader,
+    configs: ValidationConfigs,
+    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
+) -> None:
+    # Run common validations on the process of running a source connector, supporting dynamic
+    # validators that get passed in along with comparisons on the saved expected values.
+    # If overwrite_fixtures is set to True, will ignore all validators and instead overwrite
+    # the expected values with what gets generated by this test.
+    all_predownload_file_data = []
+    all_postdownload_file_data = []
+    indexer.precheck()
+    download_dir = downloader.download_config.download_dir
+    test_output_dir = configs.test_output_dir()
+    for file_data in indexer.run():
+        assert file_data
+        predownload_file_data = replace(file_data)
+        all_predownload_file_data.append(predownload_file_data)
+        if downloader.is_async():
+            resp = await downloader.run_async(file_data=file_data)
+        else:
+            resp = downloader.run(file_data=file_data)
+        if isinstance(resp, list):
+            for r in resp:
+                postdownload_file_data = replace(r["file_data"])
+                all_postdownload_file_data.append(postdownload_file_data)
+        else:
+            postdownload_file_data = replace(resp["file_data"])
+            all_postdownload_file_data.append(postdownload_file_data)
+    if not overwrite_fixtures:
+        run_all_validations(
+            configs=configs,
+            predownload_file_data=all_predownload_file_data,
+            postdownload_file_data=all_postdownload_file_data,
+            download_dir=download_dir,
+            test_output_dir=test_output_dir,
+        )
+    else:
+        update_fixtures(
+            output_dir=test_output_dir,
+            download_dir=download_dir,
+            all_file_data=all_postdownload_file_data,
+        )
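A small illustration of how omit_ignored_fields handles top-level keys, dotted paths, and the "*" wildcard; the input dict here is invented for the example:

configs = ValidationConfigs(test_id="example", exclude_fields_extend=["metadata.date_modified"])
data = {
    "identifier": "abc",
    "local_download_path": "/tmp/abc.pdf",
    "metadata": {"date_modified": "2024-01-01", "url": "s3://bucket/abc.pdf"},
}
cleaned = configs.omit_ignored_fields(data)
# cleaned == {"identifier": "abc", "metadata": {"url": "s3://bucket/abc.pdf"}}
# "local_download_path" and "metadata.date_processed" are dropped via the default
# exclude_fields; a trailing "*" (e.g. "metadata.*") would clear the whole nested dict.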
File without changes
test/integration/embedders/conftest.py
@@ -0,0 +1,13 @@
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def embedder_file() -> Path:
+    int_test_dir = Path(__file__).parent
+    assets_dir = int_test_dir / "assets"
+    embedder_file = assets_dir / "DA-1p-with-duplicate-pages.pdf.json"
+    assert embedder_file.exists()
+    assert embedder_file.is_file()
+    return embedder_file
test/integration/embedders/test_bedrock.py
@@ -0,0 +1,49 @@
+import json
+import os
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from test.integration.utils import requires_env
+from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+
+def get_aws_credentials() -> dict:
+    access_key = os.getenv("AWS_ACCESS_KEY_ID", None)
+    assert access_key
+    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", None)
+    assert secret_key
+    return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
+
+
+@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
+def test_bedrock_embedder(embedder_file: Path):
+    aws_credentials = get_aws_credentials()
+    embedder_config = EmbedderConfig(
+        embedding_provider="aws-bedrock",
+        embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
+        embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
+    )
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
+def test_raw_bedrock_embedder(embedder_file: Path):
+    aws_credentials = get_aws_credentials()
+    embedder = BedrockEmbeddingEncoder(
+        config=BedrockEmbeddingConfig(
+            aws_access_key_id=aws_credentials["aws_access_key_id"],
+            aws_secret_access_key=aws_credentials["aws_secret_access_key"],
+        )
+    )
+    validate_raw_embedder(
+        embedder=embedder,
+        embedder_file=embedder_file,
+        expected_dimensions=(1536,),
+        expected_is_unit_vector=False,
+    )
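validate_embedding_output and validate_raw_embedder come from test/integration/embedders/utils.py (+44 lines), which this extraction does not include. A rough sketch of the kind of checks the call sites imply, assuming the encoders expose an embed_documents method and that embedded elements carry an "embeddings" field (both assumptions, not confirmed by this diff):

import json
from pathlib import Path

import numpy as np


def validate_embedding_output(original_elements: list[dict], output_elements: list[dict]):
    # Hypothetical: same number of elements in and out, each output gains an embedding.
    assert len(original_elements) == len(output_elements)
    assert all("embeddings" in element for element in output_elements)


def validate_raw_embedder(
    embedder,
    embedder_file: Path,
    expected_dimensions: tuple[int, ...],
    expected_is_unit_vector: bool = True,
):
    # Hypothetical: embed the fixture elements directly, then check vector shape and norm,
    # matching the expected_dimensions/expected_is_unit_vector arguments used above.
    with embedder_file.open() as f:
        elements = json.load(f)
    embedded = embedder.embed_documents(elements=elements)
    vector = np.array(embedded[0]["embeddings"])
    assert vector.shape == expected_dimensions
    if expected_is_unit_vector:
        assert np.isclose(np.linalg.norm(vector), 1.0)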
test/integration/embedders/test_huggingface.py
@@ -0,0 +1,26 @@
+import json
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from unstructured_ingest.embed.huggingface import (
+    HuggingFaceEmbeddingConfig,
+    HuggingFaceEmbeddingEncoder,
+)
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+
+def test_huggingface_embedder(embedder_file: Path):
+    embedder_config = EmbedderConfig(embedding_provider="huggingface")
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+def test_raw_hugginface_embedder(embedder_file: Path):
+    embedder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
+    validate_raw_embedder(
+        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(384,)
+    )
test/integration/embedders/test_mixedbread.py
@@ -0,0 +1,47 @@
+import json
+import os
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from test.integration.utils import requires_env
+from unstructured_ingest.embed.mixedbreadai import (
+    MixedbreadAIEmbeddingConfig,
+    MixedbreadAIEmbeddingEncoder,
+)
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+API_KEY = "MXBAI_API_KEY"
+
+
+def get_api_key() -> str:
+    api_key = os.getenv(API_KEY, None)
+    assert api_key
+    return api_key
+
+
+@requires_env(API_KEY)
+def test_mixedbread_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder_config = EmbedderConfig(embedding_provider="mixedbread-ai", embedding_api_key=api_key)
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+@requires_env(API_KEY)
+def test_raw_mixedbread_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder = MixedbreadAIEmbeddingEncoder(
+        config=MixedbreadAIEmbeddingConfig(
+            api_key=api_key,
+        )
+    )
+    validate_raw_embedder(
+        embedder=embedder,
+        embedder_file=embedder_file,
+        expected_dimensions=(1024,),
+        expected_is_unit_vector=False,
+    )
test/integration/embedders/test_octoai.py
@@ -0,0 +1,41 @@
+import json
+import os
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from test.integration.utils import requires_env
+from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+API_KEY = "OCTOAI_API_KEY"
+
+
+def get_api_key() -> str:
+    api_key = os.getenv(API_KEY, None)
+    assert api_key
+    return api_key
+
+
+@requires_env(API_KEY)
+def test_octoai_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder_config = EmbedderConfig(embedding_provider="octoai", embedding_api_key=api_key)
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+@requires_env(API_KEY)
+def test_raw_octoai_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder = OctoAIEmbeddingEncoder(
+        config=OctoAiEmbeddingConfig(
+            api_key=api_key,
+        )
+    )
+    validate_raw_embedder(
+        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1024,)
+    )
test/integration/embedders/test_openai.py
@@ -0,0 +1,41 @@
+import json
+import os
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from test.integration.utils import requires_env
+from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+API_KEY = "OPENAI_API_KEY"
+
+
+def get_api_key() -> str:
+    api_key = os.getenv(API_KEY, None)
+    assert api_key
+    return api_key
+
+
+@requires_env(API_KEY)
+def test_openai_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder_config = EmbedderConfig(embedding_provider="openai", embedding_api_key=api_key)
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+@requires_env(API_KEY)
+def test_raw_openai_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder = OpenAIEmbeddingEncoder(
+        config=OpenAIEmbeddingConfig(
+            api_key=api_key,
+        )
+    )
+    validate_raw_embedder(
+        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
+    )
test/integration/embedders/test_vertexai.py
@@ -0,0 +1,41 @@
+import json
+import os
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from test.integration.utils import requires_env
+from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+API_KEY = "VERTEXAI_API_KEY"
+
+
+def get_api_key() -> str:
+    api_key = os.getenv(API_KEY, None)
+    assert api_key
+    return api_key
+
+
+@requires_env(API_KEY)
+def test_vertexai_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder_config = EmbedderConfig(embedding_provider="vertexai", embedding_api_key=api_key)
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+@requires_env(API_KEY)
+def test_raw_vertexai_embedder(embedder_file: Path):
+    api_key = get_api_key()
+    embedder = VertexAIEmbeddingEncoder(
+        config=VertexAIEmbeddingConfig(
+            api_key=api_key,
+        )
+    )
+    validate_raw_embedder(
+        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(768,)
+    )