unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +14 -12
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +82 -29
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
from dataclasses import dataclass, field, replace
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
|
|
8
|
+
from deepdiff import DeepDiff
|
|
9
|
+
|
|
10
|
+
from test.integration.connectors.utils.constants import expected_results_path
|
|
11
|
+
from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class ValidationConfigs:
    """Settings controlling how a source-connector integration run is validated."""

    # Identifier used to locate this test's fixture directory on disk.
    test_id: str
    # When set, the download directory must contain exactly this many files.
    expected_num_files: Optional[int] = None
    # Optional hooks invoked on each FileData before/after the download step.
    predownload_file_data_check: Optional[Callable[[FileData], None]] = None
    postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
    # Dotted-path fields omitted from fixture comparison because they change on
    # every run; a "*" leaf clears the whole containing dict.
    exclude_fields: list[str] = field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
    )
    # Extra exclusions appended by individual tests on top of the defaults.
    exclude_fields_extend: list[str] = field(default_factory=list)

    def get_exclude_fields(self) -> list[str]:
        """Return the combined exclusion list as a fresh list.

        The previous implementation extended ``self.exclude_fields`` in place,
        so every call appended the ``exclude_fields_extend`` entries again and
        permanently mutated the config instance.
        """
        return [*self.exclude_fields, *self.exclude_fields_extend]

    def run_file_data_validation(
        self, predownload_file_data: FileData, postdownload_file_data: FileData
    ):
        """Run the optional pre/post download hooks, when configured."""
        if predownload_file_data_check := self.predownload_file_data_check:
            predownload_file_data_check(predownload_file_data)
        if postdownload_file_data_check := self.postdownload_file_data_check:
            postdownload_file_data_check(postdownload_file_data)

    def run_download_dir_validation(self, download_dir: Path):
        """Assert the download dir holds the expected number of files, if configured."""
        if expected_num_files := self.expected_num_files:
            downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
            assert len(downloaded_files) == expected_num_files

    def test_output_dir(self) -> Path:
        """Location of this test's expected-results fixtures."""
        return expected_results_path / self.test_id

    def omit_ignored_fields(self, data: dict) -> dict:
        """Return a copy of ``data`` with every excluded field removed.

        Works on a deep copy: the previous shallow ``data.copy()`` meant that
        popping nested fields (e.g. ``metadata.date_processed``) mutated the
        caller's original dict as a side effect.
        """
        from copy import deepcopy

        copied_data = deepcopy(data)
        for exclude_field in self.get_exclude_fields():
            # Split "a.b.c" into the parent path and the leaf to drop.
            *parent_keys, drop_field = exclude_field.split(".")
            current_val = copied_data
            for key in parent_keys:
                current_val = current_val.get(key, {})
            if drop_field == "*":
                current_val.clear()
            else:
                current_val.pop(drop_field, None)
        return copied_data
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_files(dir_path: Path) -> list[str]:
    """Return the names of the regular files directly inside ``dir_path``, relative to it."""
    prefix = str(dir_path)
    found = []
    for entry in dir_path.iterdir():
        if entry.is_file():
            found.append(str(entry).replace(prefix, "").lstrip("/"))
    return found
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
    """Assert the fixture dir and the current run produced exactly the same file set."""
    expected = set(get_files(dir_path=expected_output_dir))
    actual = {f"{file_data.identifier}.json" for file_data in all_file_data}
    mismatched = expected.symmetric_difference(actual)
    assert not mismatched, "diff in files that exist: {}".format(", ".join(mismatched))
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def check_contents(
    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
):
    """Compare each FileData against its saved fixture, ignoring excluded fields.

    Every diff is printed before failing so a single run reports all mismatches.
    The failure message now lists the mismatching identifiers; the previous
    version interpolated the boolean flag ("Diffs found between files: True"),
    which identified nothing.
    """
    mismatched_identifiers: list[str] = []
    for file_data in all_file_data:
        file_data_path = expected_output_dir / f"{file_data.identifier}.json"
        with file_data_path.open("r") as file:
            expected_file_data_contents = json.load(file)
        current_file_data_contents = file_data.to_dict()
        # Strip run-dependent fields from both sides before diffing.
        expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
        current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
        diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
        if diff:
            mismatched_identifiers.append(file_data.identifier)
            print(diff.to_json(indent=2))
    assert not mismatched_identifiers, (
        f"Diffs found between files: {', '.join(mismatched_identifiers)}"
    )
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def run_expected_results_validation(
    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
):
    """Validate both the set of produced files and each file's contents against fixtures."""
    check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
    check_contents(
        expected_output_dir=expected_output_dir,
        all_file_data=all_file_data,
        configs=configs,
    )
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
    """Assert the downloaded file layout matches the recorded fixture snapshot."""
    record_path = expected_output_dir / "directory_structure.json"
    with record_path.open("r") as record_file:
        recorded = json.load(record_file)
    assert recorded["directory_structure"] == download_files
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[FileData]):
    """Regenerate the expected-results fixtures from the current run's output."""
    # Start from a clean slate.
    shutil.rmtree(path=output_dir, ignore_errors=True)
    output_dir.mkdir(parents=True)

    # Persist each FileData payload under file_data/<identifier>.json.
    file_data_dir = output_dir / "file_data"
    file_data_dir.mkdir(parents=True)
    for file_data in all_file_data:
        destination = file_data_dir / f"{file_data.identifier}.json"
        with destination.open(mode="w") as handle:
            json.dump(file_data.to_dict(), handle, indent=2)

    # Snapshot the (sorted) layout of the download directory.
    snapshot = sorted(get_files(dir_path=download_dir))
    record_path = output_dir / "directory_structure.json"
    with record_path.open(mode="w") as handle:
        json.dump({"directory_structure": snapshot}, handle, indent=2)
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def run_all_validations(
    configs: ValidationConfigs,
    predownload_file_data: list[FileData],
    postdownload_file_data: list[FileData],
    download_dir: Path,
    test_output_dir: Path,
):
    """Run every configured validation over the results of a connector run.

    Raises:
        AssertionError: if any individual validation fails.
    """
    # zip() would silently truncate if the two lists ever diverged; fail loudly instead.
    assert len(predownload_file_data) == len(postdownload_file_data), (
        "pre/post download file data lists differ in length: "
        f"{len(predownload_file_data)} != {len(postdownload_file_data)}"
    )
    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
        configs.run_file_data_validation(
            predownload_file_data=pre_data, postdownload_file_data=post_data
        )
    configs.run_download_dir_validation(download_dir=download_dir)
    run_expected_results_validation(
        expected_output_dir=test_output_dir / "file_data",
        all_file_data=postdownload_file_data,
        configs=configs,
    )
    download_files = sorted(get_files(dir_path=download_dir))
    run_directory_structure_validation(
        expected_output_dir=configs.test_output_dir(), download_files=download_files
    )
|
158
|
+
|
|
159
|
+
|
|
160
|
+
async def source_connector_validation(
    indexer: Indexer,
    downloader: Downloader,
    configs: ValidationConfigs,
    overwrite_fixtures: Optional[bool] = None,
) -> None:
    """Run common validations on a full index + download pass of a source connector.

    Dynamic validators passed in via ``configs`` run alongside comparisons
    against the saved expected values. If ``overwrite_fixtures`` resolves to
    True, all validators are skipped and the expected values are rewritten
    with whatever this run generates.

    Args:
        indexer: connector indexer producing FileData entries.
        downloader: connector downloader (sync or async).
        configs: validation settings and fixture location.
        overwrite_fixtures: when None (the default), read from the
            OVERWRITE_FIXTURES env var at call time. The previous signature
            evaluated the env var once at import time, so changing the
            variable after import was silently ignored.
    """
    if overwrite_fixtures is None:
        overwrite_fixtures = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true"
    all_predownload_file_data = []
    all_postdownload_file_data = []
    indexer.precheck()
    download_dir = downloader.download_config.download_dir
    test_output_dir = configs.test_output_dir()
    for file_data in indexer.run():
        assert file_data
        # Snapshot the FileData before download so post-download mutations are visible.
        predownload_file_data = replace(file_data)
        all_predownload_file_data.append(predownload_file_data)
        if downloader.is_async():
            resp = await downloader.run_async(file_data=file_data)
        else:
            resp = downloader.run(file_data=file_data)
        postdownload_file_data = replace(resp["file_data"])
        all_postdownload_file_data.append(postdownload_file_data)
    if not overwrite_fixtures:
        run_all_validations(
            configs=configs,
            predownload_file_data=all_predownload_file_data,
            postdownload_file_data=all_postdownload_file_data,
            download_dir=download_dir,
            test_output_dir=test_output_dir,
        )
    else:
        update_fixtures(
            output_dir=test_output_dir,
            download_dir=download_dir,
            all_file_data=all_postdownload_file_data,
        )
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@pytest.fixture
def embedder_file() -> Path:
    """Return the path to the elements-JSON asset shared by embedder integration tests."""
    assets_dir = Path(__file__).parent / "assets"
    candidate = assets_dir / "DA-1p-with-duplicate-pages.pdf.json"
    assert candidate.exists()
    assert candidate.is_file()
    return candidate
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
|
+
from test.integration.utils import requires_env
|
|
7
|
+
from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
|
|
8
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_aws_credentials() -> dict:
    """Pull AWS credentials from the environment, failing fast when either is unset."""
    credentials = {}
    for env_var, key in (
        ("AWS_ACCESS_KEY_ID", "aws_access_key_id"),
        ("AWS_SECRET_ACCESS_KEY", "aws_secret_access_key"),
    ):
        value = os.getenv(env_var, None)
        assert value
        credentials[key] = value
    return credentials


@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_bedrock_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the aws-bedrock provider."""
    creds = get_aws_credentials()
    config = EmbedderConfig(
        embedding_provider="aws-bedrock",
        embedding_aws_access_key_id=creds["aws_access_key_id"],
        embedding_aws_secret_access_key=creds["aws_secret_access_key"],
    )
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_raw_bedrock_embedder(embedder_file: Path):
    """Exercise the Bedrock encoder directly, bypassing the v2 Embedder wrapper."""
    creds = get_aws_credentials()
    encoder = BedrockEmbeddingEncoder(
        config=BedrockEmbeddingConfig(
            aws_access_key_id=creds["aws_access_key_id"],
            aws_secret_access_key=creds["aws_secret_access_key"],
        )
    )
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
        expected_is_unit_vector=False,
    )
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
5
|
+
from unstructured_ingest.embed.huggingface import (
|
|
6
|
+
HuggingFaceEmbeddingConfig,
|
|
7
|
+
HuggingFaceEmbeddingEncoder,
|
|
8
|
+
)
|
|
9
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_huggingface_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the huggingface provider."""
    config = EmbedderConfig(embedding_provider="huggingface")
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


# NOTE(review): "hugginface" typo kept — renaming would change the collected test id.
def test_raw_hugginface_embedder(embedder_file: Path):
    """Exercise the HuggingFace encoder directly with its default configuration."""
    encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(384,)
    )
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
|
+
from test.integration.utils import requires_env
|
|
7
|
+
from unstructured_ingest.embed.mixedbreadai import (
|
|
8
|
+
MixedbreadAIEmbeddingConfig,
|
|
9
|
+
MixedbreadAIEmbeddingEncoder,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
12
|
+
|
|
13
|
+
# Env var holding the Mixedbread API key.
API_KEY = "MXBAI_API_KEY"


def get_api_key() -> str:
    """Read the Mixedbread API key from the environment, failing fast if unset."""
    value = os.getenv(API_KEY, None)
    assert value
    return value


@requires_env(API_KEY)
def test_mixedbread_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the mixedbread-ai provider."""
    config = EmbedderConfig(embedding_provider="mixedbread-ai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_mixedbread_embedder(embedder_file: Path):
    """Exercise the Mixedbread encoder directly, bypassing the v2 Embedder wrapper."""
    encoder = MixedbreadAIEmbeddingEncoder(
        config=MixedbreadAIEmbeddingConfig(api_key=get_api_key())
    )
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
        expected_is_unit_vector=False,
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
|
+
from test.integration.utils import requires_env
|
|
7
|
+
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
8
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
|
+
|
|
10
|
+
# Env var holding the OctoAI API key.
API_KEY = "OCTOAI_API_KEY"


def get_api_key() -> str:
    """Read the OctoAI API key from the environment, failing fast if unset."""
    value = os.getenv(API_KEY, None)
    assert value
    return value


@requires_env(API_KEY)
def test_octoai_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the octoai provider."""
    config = EmbedderConfig(embedding_provider="octoai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_octoai_embedder(embedder_file: Path):
    """Exercise the OctoAI encoder directly, bypassing the v2 Embedder wrapper."""
    encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1024,)
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
|
+
from test.integration.utils import requires_env
|
|
7
|
+
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
8
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
|
+
|
|
10
|
+
# Env var holding the OpenAI API key.
API_KEY = "OPENAI_API_KEY"


def get_api_key() -> str:
    """Read the OpenAI API key from the environment, failing fast if unset."""
    value = os.getenv(API_KEY, None)
    assert value
    return value


@requires_env(API_KEY)
def test_openai_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the openai provider."""
    config = EmbedderConfig(embedding_provider="openai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_openai_embedder(embedder_file: Path):
    """Exercise the OpenAI encoder directly, bypassing the v2 Embedder wrapper."""
    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1536,)
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
|
+
from test.integration.utils import requires_env
|
|
7
|
+
from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
|
8
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
|
+
|
|
10
|
+
# Env var holding the Vertex AI API key.
API_KEY = "VERTEXAI_API_KEY"


def get_api_key() -> str:
    """Read the Vertex AI API key from the environment, failing fast if unset."""
    value = os.getenv(API_KEY, None)
    assert value
    return value


@requires_env(API_KEY)
def test_vertexai_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the vertexai provider."""
    config = EmbedderConfig(embedding_provider="vertexai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_vertexai_embedder(embedder_file: Path):
    """Exercise the Vertex AI encoder directly, bypassing the v2 Embedder wrapper."""
    encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(768,)
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
|
+
from test.integration.utils import requires_env
|
|
7
|
+
from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
|
|
8
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
9
|
+
|
|
10
|
+
# Env var holding the Voyage AI API key.
API_KEY = "VOYAGEAI_API_KEY"


def get_api_key() -> str:
    """Read the Voyage AI API key from the environment, failing fast if unset."""
    value = os.getenv(API_KEY, None)
    assert value
    return value


@requires_env(API_KEY)
def test_voyageai_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the voyageai provider."""
    config = EmbedderConfig(embedding_provider="voyageai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_voyageai_embedder(embedder_file: Path):
    """Exercise the Voyage AI encoder directly, bypassing the v2 Embedder wrapper."""
    encoder = VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1024,)
    )
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
6
|
+
from test.integration.utils import requires_env
|
|
7
|
+
from unstructured_ingest.embed.togetherai import (
|
|
8
|
+
TogetherAIEmbeddingConfig,
|
|
9
|
+
TogetherAIEmbeddingEncoder,
|
|
10
|
+
)
|
|
11
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
12
|
+
|
|
13
|
+
# Env var holding the Together AI API key.
API_KEY = "TOGETHERAI_API_KEY"


def get_api_key() -> str:
    """Read the Together AI API key from the environment, failing fast if unset."""
    value = os.getenv(API_KEY, None)
    assert value
    return value


@requires_env(API_KEY)
def test_togetherai_embedder(embedder_file: Path):
    """End-to-end check of the v2 Embedder wired to the togetherai provider."""
    config = EmbedderConfig(embedding_provider="togetherai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as source:
        original_elements = json.load(source)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_togetherai_embedder(embedder_file: Path):
    """Exercise the Together AI encoder directly, bypassing the v2 Embedder wrapper."""
    encoder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(768,),
        expected_is_unit_vector=False,
    )
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def validate_embedding_output(original_elements: list[dict], output_elements: list[dict]):
    """Check the embedder's output against its input.

    Verifies that:
    * the same number of elements is returned
    * every element that had text gained an ``embeddings`` entry
    * nothing else about any element changed

    Note: ``embeddings`` is popped from each output element as a side effect.
    """
    assert len(original_elements) == len(output_elements)
    for before, after in zip(original_elements, output_elements):
        embedding = after.pop("embeddings", None)
        if before.get("text"):
            assert embedding
        assert before == after
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def validate_raw_embedder(
    embedder: BaseEmbeddingEncoder,
    embedder_file: Path,
    expected_dimensions: Optional[tuple[int, ...]] = None,
    expected_is_unit_vector: bool = True,
):
    """Exercise an embedding encoder directly and sanity-check its outputs."""
    with open(embedder_file) as source:
        elements = json.load(source)
    texts = [element["text"] for element in elements]
    first_text = texts[0]

    dimensions = embedder.num_of_dimensions
    if expected_dimensions:
        assert (
            dimensions == expected_dimensions
        ), f"number of dimensions {dimensions} didn't match expected: {expected_dimensions}"
    assert embedder.is_unit_vector == expected_is_unit_vector

    # A single-query embedding must match the advertised dimensionality.
    query_embedding = embedder.embed_query(query=first_text)
    assert len(query_embedding) == dimensions[0]

    # Batch embedding must preserve every element apart from adding embeddings.
    embedded_elements = embedder.embed_documents(elements=elements)
    validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
|
|
File without changes
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from unstructured_client.models.errors.sdkerror import SDKError
|
|
7
|
+
|
|
8
|
+
from test.integration.utils import requires_env
|
|
9
|
+
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
|
|
10
|
+
|
|
11
|
+
int_test_dir = Path(__file__).parent
assets_dir = int_test_dir / "assets"

# Suffixes treated as images; the "fast" strategy cannot partition these.
_IMAGE_SUFFIXES = (".jpg", ".png", ".tif")

all_partition_files = [entry for entry in assets_dir.iterdir() if entry.is_file()]
non_image_partition_files = [
    entry for entry in all_partition_files if entry.suffix not in _IMAGE_SUFFIXES
]
image_partition_files = [
    entry for entry in all_partition_files if entry.suffix in _IMAGE_SUFFIXES
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.mark.parametrize(
    "partition_file", all_partition_files, ids=[path.name for path in all_partition_files]
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_hi_res(partition_file: Path):
    """Partition every asset via the hosted API (hi_res) and persist the raw output."""
    partitioner = Partitioner(
        config=PartitionerConfig(
            strategy="hi_res",
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
        )
    )
    results = await partitioner.run_async(filename=partition_file)
    # Save raw output alongside the tests for manual inspection.
    results_dir = int_test_dir / "results"
    results_dir.mkdir(exist_ok=True)
    with (results_dir / f"{partition_file.name}.json").open("w") as sink:
        json.dump(results, sink, indent=2)
    assert results


@pytest.mark.parametrize(
    "partition_file",
    non_image_partition_files,
    ids=[path.name for path in non_image_partition_files],
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_fast(partition_file: Path):
    """Partition non-image assets via the hosted API using the fast strategy."""
    partitioner = Partitioner(
        config=PartitionerConfig(
            strategy="fast",
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
        )
    )
    results = await partitioner.run_async(filename=partition_file)
    assert results


@pytest.mark.parametrize(
    "partition_file", image_partition_files, ids=[path.name for path in image_partition_files]
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_fast_error(partition_file: Path):
    """Images are unsupported by the fast strategy: the API call must raise SDKError."""
    partitioner = Partitioner(
        config=PartitionerConfig(
            strategy="fast",
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
        )
    )
    with pytest.raises(SDKError):
        await partitioner.run_async(filename=partition_file)
|