unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (86) hide show
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/sql/__init__.py +0 -0
  10. test/integration/connectors/sql/test_postgres.py +178 -0
  11. test/integration/connectors/sql/test_sqlite.py +151 -0
  12. test/integration/connectors/test_s3.py +152 -0
  13. test/integration/connectors/utils/__init__.py +0 -0
  14. test/integration/connectors/utils/constants.py +7 -0
  15. test/integration/connectors/utils/docker_compose.py +44 -0
  16. test/integration/connectors/utils/validation.py +203 -0
  17. test/integration/embedders/__init__.py +0 -0
  18. test/integration/embedders/conftest.py +13 -0
  19. test/integration/embedders/test_bedrock.py +49 -0
  20. test/integration/embedders/test_huggingface.py +26 -0
  21. test/integration/embedders/test_mixedbread.py +47 -0
  22. test/integration/embedders/test_octoai.py +41 -0
  23. test/integration/embedders/test_openai.py +41 -0
  24. test/integration/embedders/test_vertexai.py +41 -0
  25. test/integration/embedders/test_voyageai.py +41 -0
  26. test/integration/embedders/togetherai.py +43 -0
  27. test/integration/embedders/utils.py +44 -0
  28. test/integration/partitioners/__init__.py +0 -0
  29. test/integration/partitioners/test_partitioner.py +75 -0
  30. test/integration/utils.py +15 -0
  31. test/unit/__init__.py +0 -0
  32. test/unit/embed/__init__.py +0 -0
  33. test/unit/embed/test_mixedbreadai.py +41 -0
  34. test/unit/embed/test_octoai.py +20 -0
  35. test/unit/embed/test_openai.py +20 -0
  36. test/unit/embed/test_vertexai.py +25 -0
  37. test/unit/embed/test_voyageai.py +24 -0
  38. test/unit/test_chunking_utils.py +36 -0
  39. test/unit/test_error.py +27 -0
  40. test/unit/test_interfaces.py +280 -0
  41. test/unit/test_interfaces_v2.py +26 -0
  42. test/unit/test_logger.py +78 -0
  43. test/unit/test_utils.py +164 -0
  44. test/unit/test_utils_v2.py +82 -0
  45. unstructured_ingest/__version__.py +1 -1
  46. unstructured_ingest/cli/interfaces.py +2 -2
  47. unstructured_ingest/connector/notion/types/block.py +1 -0
  48. unstructured_ingest/connector/notion/types/database.py +1 -0
  49. unstructured_ingest/connector/notion/types/page.py +1 -0
  50. unstructured_ingest/embed/bedrock.py +0 -20
  51. unstructured_ingest/embed/huggingface.py +0 -21
  52. unstructured_ingest/embed/interfaces.py +29 -3
  53. unstructured_ingest/embed/mixedbreadai.py +0 -36
  54. unstructured_ingest/embed/octoai.py +2 -24
  55. unstructured_ingest/embed/openai.py +0 -20
  56. unstructured_ingest/embed/togetherai.py +40 -0
  57. unstructured_ingest/embed/vertexai.py +0 -20
  58. unstructured_ingest/embed/voyageai.py +1 -24
  59. unstructured_ingest/interfaces.py +1 -1
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  72. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  74. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  75. unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
  76. unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
  78. unstructured_ingest/v2/processes/embedder.py +13 -0
  79. unstructured_ingest/v2/processes/partitioner.py +2 -1
  80. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
  83. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  84. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,152 @@
1
+ import os
2
+ import tempfile
3
+ import uuid
4
+ from pathlib import Path
5
+
6
+ import pytest
7
+
8
+ from test.integration.connectors.utils.constants import (
9
+ DESTINATION_TAG,
10
+ SOURCE_TAG,
11
+ env_setup_path,
12
+ )
13
+ from test.integration.connectors.utils.docker_compose import docker_compose_context
14
+ from test.integration.connectors.utils.validation import (
15
+ ValidationConfigs,
16
+ source_connector_validation,
17
+ )
18
+ from test.integration.utils import requires_env
19
+ from unstructured_ingest.error import (
20
+ SourceConnectionError,
21
+ )
22
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
23
+ from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
24
+ CONNECTOR_TYPE,
25
+ S3AccessConfig,
26
+ S3ConnectionConfig,
27
+ S3Downloader,
28
+ S3DownloaderConfig,
29
+ S3Indexer,
30
+ S3IndexerConfig,
31
+ S3Uploader,
32
+ S3UploaderConfig,
33
+ )
34
+
35
+
36
def validate_predownload_file_data(file_data: FileData):
    """Invariants for file data emitted by the indexer before any download:
    it must carry the s3 connector type and no local download path yet."""
    assert file_data.connector_type == CONNECTOR_TYPE
    assert file_data.local_download_path is None
39
+
40
+
41
def validate_postdownload_file_data(file_data: FileData):
    """Invariants for file data after the downloader has run: connector type
    unchanged and a local download path has been populated."""
    assert file_data.connector_type == CONNECTOR_TYPE
    assert file_data.local_download_path is not None
44
+
45
+
46
@pytest.fixture
def anon_connection_config() -> S3ConnectionConfig:
    """Connection config for anonymous (unauthenticated) access to public buckets."""
    return S3ConnectionConfig(access_config=S3AccessConfig(), anonymous=True)
49
+
50
+
51
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
async def test_s3_source(anon_connection_config: S3ConnectionConfig):
    """Index and download a public fixture bucket anonymously, then run the
    shared source-connector validations against the recorded expected results."""
    validation_configs = ValidationConfigs(
        test_id="s3",
        predownload_file_data_check=validate_predownload_file_data,
        postdownload_file_data_check=validate_postdownload_file_data,
        expected_num_files=4,
    )
    indexer = S3Indexer(
        connection_config=anon_connection_config,
        index_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"),
    )
    with tempfile.TemporaryDirectory() as work_dir:
        downloader = S3Downloader(
            connection_config=anon_connection_config,
            download_config=S3DownloaderConfig(download_dir=Path(work_dir)),
        )
        await source_connector_validation(
            indexer=indexer, downloader=downloader, configs=validation_configs
        )
72
+
73
+
74
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
async def test_s3_source_no_access(anon_connection_config: S3ConnectionConfig):
    """Prechecking a bucket the anonymous user cannot read must raise."""
    indexer = S3Indexer(
        connection_config=anon_connection_config,
        index_config=S3IndexerConfig(remote_url="s3://utic-ingest-test-fixtures/destination/"),
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
81
+
82
+
83
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, "minio")
async def test_s3_minio_source(anon_connection_config: S3ConnectionConfig):
    """Same flow as test_s3_source but against a local MinIO brought up via
    docker compose; timestamp-like fields are excluded from the fixture diff
    since they change on every container start."""
    anon_connection_config.endpoint_url = "http://localhost:9000"
    validation_configs = ValidationConfigs(
        test_id="s3-minio",
        predownload_file_data_check=validate_predownload_file_data,
        postdownload_file_data_check=validate_postdownload_file_data,
        expected_num_files=1,
        exclude_fields_extend=[
            "metadata.date_modified",
            "metadata.date_created",
            "additional_metadata.LastModified",
        ],
    )
    with docker_compose_context(docker_compose_path=env_setup_path / "minio"):
        with tempfile.TemporaryDirectory() as work_dir:
            indexer = S3Indexer(
                connection_config=anon_connection_config,
                index_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/"),
            )
            downloader = S3Downloader(
                connection_config=anon_connection_config,
                download_config=S3DownloaderConfig(download_dir=Path(work_dir)),
            )
            await source_connector_validation(
                indexer=indexer, downloader=downloader, configs=validation_configs
            )
113
+
114
+
115
def get_aws_credentials() -> dict:
    """Read the S3 ingest test credentials from the environment.

    Returns a dict shaped for the S3 access config. The asserts carry
    messages so a failure names the missing variable instead of producing a
    bare AssertionError.
    """
    access_key = os.getenv("S3_INGEST_TEST_ACCESS_KEY", None)
    assert access_key, "S3_INGEST_TEST_ACCESS_KEY env var not set"
    secret_key = os.getenv("S3_INGEST_TEST_SECRET_KEY", None)
    assert secret_key, "S3_INGEST_TEST_SECRET_KEY env var not set"
    return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
121
+
122
+
123
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
async def test_s3_destination(upload_file: Path):
    """Upload one file to a unique destination prefix, confirm exactly one
    object lands there, and always delete the remote prefix afterwards."""
    credentials = get_aws_credentials()
    s3_bucket = "s3://utic-ingest-test-fixtures"
    destination_path = f"{s3_bucket}/destination/{uuid.uuid4()}"
    connection_config = S3ConnectionConfig(
        access_config=S3AccessConfig(
            key=credentials["aws_access_key_id"],
            secret=credentials["aws_secret_access_key"],
        ),
    )
    uploader = S3Uploader(
        connection_config=connection_config,
        upload_config=S3UploaderConfig(remote_url=destination_path),
    )
    s3fs = uploader.fs
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    try:
        if uploader.is_async():
            await uploader.run_async(path=upload_file, file_data=file_data)
        else:
            uploader.run(path=upload_file, file_data=file_data)
        assert len(s3fs.ls(path=destination_path)) == 1
    finally:
        # Clean up the remote prefix whether or not the upload/assert succeeded
        s3fs.rm(path=destination_path, recursive=True)
File without changes
@@ -0,0 +1,7 @@
1
from pathlib import Path

# Marker tags used to categorize connector integration tests
SOURCE_TAG = "source"
DESTINATION_TAG = "destination"

# Shared fixture locations, resolved relative to the connectors test package
env_setup_path = Path(__file__).parents[1] / "env_setup"
expected_results_path = Path(__file__).parents[1] / "expected_results"
@@ -0,0 +1,44 @@
1
import subprocess
from contextlib import contextmanager
from pathlib import Path


def _resolve_compose_file(docker_compose_path: Path) -> Path:
    """If given a directory, resolve it to the docker-compose file it contains."""
    if docker_compose_path.is_dir():
        for candidate in ("docker-compose.yml", "docker-compose.yaml"):
            if (docker_compose_path / candidate).exists():
                return docker_compose_path / candidate
    return docker_compose_path


@contextmanager
def docker_compose_context(docker_compose_path: Path):
    """Dynamically run a specific docker compose file and make sure it gets
    cleaned up by leveraging a context manager. Uses subprocess to map docker
    compose commands to the underlying shell.

    `docker_compose_path` may be the compose file itself or a directory
    containing docker-compose.yml / docker-compose.yaml.
    """
    assert docker_compose_path.exists()
    docker_compose_path = _resolve_compose_file(docker_compose_path)
    assert docker_compose_path.is_file()
    resp = None
    try:
        cmd = f"docker compose -f {docker_compose_path.resolve()} up -d --wait"
        print(f"Running command: {cmd}")
        resp = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
        )
        # Return code from docker compose using --wait can be 1 even if no
        # error occurred, so it is deliberately not checked here.
        yield
    except Exception as e:
        # Surface the compose "up" output to help diagnose failures in the body
        if resp:
            print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
            print("STDERR: {}".format(resp.stderr.decode("utf-8")))
        raise e
    finally:
        # Always tear the stack down, including volumes and orphan containers
        cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v"
        print(f"Running command: {cmd}")
        final_resp = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
        )
        if final_resp.returncode != 0:
            print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
            print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
@@ -0,0 +1,203 @@
1
import copy
import json
import os
import shutil
from dataclasses import dataclass, field, replace
from pathlib import Path
from typing import Callable, Optional

from deepdiff import DeepDiff

from test.integration.connectors.utils.constants import expected_results_path
from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
12
+
13
+
14
@dataclass
class ValidationConfigs:
    """Tunable knobs for the shared source-connector validation flow.

    Read by `source_connector_validation` and friends to decide which checks
    to run and which file-data fields to ignore when diffing against
    recorded fixtures.
    """

    # Identifier of the test; also names the expected-results fixture directory.
    test_id: str
    # When set, the download dir must contain exactly this many files.
    expected_num_files: Optional[int] = None
    # Optional hooks run against each file data before/after download.
    predownload_file_data_check: Optional[Callable[["FileData"], None]] = None
    postdownload_file_data_check: Optional[Callable[["FileData"], None]] = None
    # Dotted-path fields that change on every run and never take part in diffs.
    exclude_fields: list[str] = field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
    )
    # Per-test additions to exclude_fields.
    exclude_fields_extend: list[str] = field(default_factory=list)

    def get_exclude_fields(self) -> list[str]:
        """Return the combined exclude list without mutating either source list.

        The previous implementation extended `self.exclude_fields` in place,
        so every call appended another copy of the extension fields.
        """
        return list(self.exclude_fields) + list(self.exclude_fields_extend)

    def run_file_data_validation(
        self, predownload_file_data: "FileData", postdownload_file_data: "FileData"
    ):
        """Invoke the optional pre/post download hooks when configured."""
        if predownload_file_data_check := self.predownload_file_data_check:
            predownload_file_data_check(predownload_file_data)
        if postdownload_file_data_check := self.postdownload_file_data_check:
            postdownload_file_data_check(postdownload_file_data)

    def run_download_dir_validation(self, download_dir: Path):
        """Assert the download dir holds exactly the expected number of files, if set."""
        if expected_num_files := self.expected_num_files:
            downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
            assert len(downloaded_files) == expected_num_files

    def test_output_dir(self) -> Path:
        """Location of the recorded expected results for this test id."""
        return expected_results_path / self.test_id

    def omit_ignored_fields(self, data: dict) -> dict:
        """Return a copy of `data` with all excluded (dynamic) fields removed.

        Fields are dotted paths; a trailing `*` clears the whole containing
        dict. A deep copy is taken so nested dicts of the caller's data are
        never mutated (the previous shallow copy popped keys out of shared
        sub-dicts).
        """
        copied_data = copy.deepcopy(data)
        for exclude_field in self.get_exclude_fields():
            path_parts = exclude_field.split(".")
            # Walk to the dict that holds the field to drop; missing
            # intermediate keys resolve to an empty throwaway dict.
            current_val = copied_data
            for part in path_parts[:-1]:
                current_val = current_val.get(part, {})
            drop_field = path_parts[-1]
            if drop_field == "*":
                current_val.clear()
            else:
                current_val.pop(drop_field, None)
        return copied_data
66
+
67
+
68
def get_files(dir_path: Path) -> list[str]:
    """Return the names of the files directly inside `dir_path`, relative to it.

    Uses Path.relative_to rather than blanket string replacement, which could
    corrupt names if the directory path's text also occurred inside a
    file name. Non-recursive: subdirectories are not descended into.
    """
    return [str(f.relative_to(dir_path)) for f in dir_path.iterdir() if f.is_file()]
72
+
73
+
74
def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
    """Assert the set of produced file-data ids matches the recorded fixtures.

    The symmetric difference catches both missing and unexpected files.
    """
    expected_files = set(get_files(dir_path=expected_output_dir))
    current_files = {f"{file_data.identifier}.json" for file_data in all_file_data}
    diff = expected_files ^ current_files
    assert not diff, "diff in files that exist: {}".format(", ".join(diff))
79
+
80
+
81
def check_contents(
    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
):
    """Diff each file data against its recorded fixture, ignoring dynamic fields.

    Every diff is printed before failing so a single run surfaces all
    mismatches, and the assertion message names the offending identifiers
    (the previous message interpolated a bare bool).
    """
    mismatched_identifiers = []
    for file_data in all_file_data:
        file_data_path = expected_output_dir / f"{file_data.identifier}.json"
        with file_data_path.open("r") as file:
            expected_file_data_contents = json.load(file)
        current_file_data_contents = file_data.to_dict()
        expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
        current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
        diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
        if diff:
            mismatched_identifiers.append(file_data.identifier)
            print(diff.to_json(indent=2))
    assert not mismatched_identifiers, "Diffs found between files: {}".format(
        ", ".join(mismatched_identifiers)
    )
97
+
98
+
99
def run_expected_results_validation(
    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
):
    """Validate both which file-data files exist and their contents against
    the recorded fixtures in `expected_output_dir`."""
    check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
    check_contents(
        expected_output_dir=expected_output_dir, all_file_data=all_file_data, configs=configs
    )
106
+
107
+
108
def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
    """Compare the downloaded file listing against the recorded fixture.

    The fixture is a JSON file with a single "directory_structure" key
    holding the expected (sorted) list of relative file names.
    """
    record_path = expected_output_dir / "directory_structure.json"
    with record_path.open("r") as record_file:
        expected_structure = json.load(record_file)["directory_structure"]
    assert expected_structure == download_files
114
+
115
+
116
def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[FileData]):
    """Regenerate the on-disk fixtures from the data produced by this run."""
    # Start from a clean slate
    shutil.rmtree(path=output_dir, ignore_errors=True)
    output_dir.mkdir(parents=True)
    # Persist every file data payload under its identifier
    file_data_output_path = output_dir / "file_data"
    file_data_output_path.mkdir(parents=True)
    for file_data in all_file_data:
        destination = file_data_output_path / f"{file_data.identifier}.json"
        with destination.open(mode="w") as f:
            json.dump(file_data.to_dict(), f, indent=2)

    # Record the resulting layout of the download directory
    download_files = sorted(get_files(dir_path=download_dir))
    download_dir_record = output_dir / "directory_structure.json"
    with download_dir_record.open(mode="w") as f:
        json.dump({"directory_structure": download_files}, f, indent=2)
134
+
135
+
136
def run_all_validations(
    configs: ValidationConfigs,
    predownload_file_data: list[FileData],
    postdownload_file_data: list[FileData],
    download_dir: Path,
    test_output_dir: Path,
):
    """Run every configured validation for a completed source-connector run."""
    # Per-file hooks, paired pre/post download
    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
        configs.run_file_data_validation(
            predownload_file_data=pre_data, postdownload_file_data=post_data
        )
    # File-count check on the download directory
    configs.run_download_dir_validation(download_dir=download_dir)
    # Produced file data vs recorded fixtures
    run_expected_results_validation(
        expected_output_dir=test_output_dir / "file_data",
        all_file_data=postdownload_file_data,
        configs=configs,
    )
    # Download directory layout vs recorded fixture
    run_directory_structure_validation(
        expected_output_dir=configs.test_output_dir(),
        download_files=sorted(get_files(dir_path=download_dir)),
    )
158
+
159
+
160
async def source_connector_validation(
    indexer: Indexer,
    downloader: Downloader,
    configs: ValidationConfigs,
    overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
) -> None:
    """Run common validations on the process of running a source connector,
    supporting dynamic validators that get passed in along with comparisons
    on the saved expected values.

    If overwrite_fixtures is set to True, all validators are skipped and the
    expected values are instead overwritten with what this run generates.
    """
    predownload_data: list = []
    postdownload_data: list = []
    indexer.precheck()
    download_dir = downloader.download_config.download_dir
    test_output_dir = configs.test_output_dir()
    for file_data in indexer.run():
        assert file_data
        # Snapshot the file data before the downloader can mutate it
        predownload_data.append(replace(file_data))
        if downloader.is_async():
            resp = await downloader.run_async(file_data=file_data)
        else:
            resp = downloader.run(file_data=file_data)
        # Downloaders may return one response or a list of them
        responses = resp if isinstance(resp, list) else [resp]
        for r in responses:
            postdownload_data.append(replace(r["file_data"]))
    if overwrite_fixtures:
        update_fixtures(
            output_dir=test_output_dir,
            download_dir=download_dir,
            all_file_data=postdownload_data,
        )
    else:
        run_all_validations(
            configs=configs,
            predownload_file_data=predownload_data,
            postdownload_file_data=postdownload_data,
            download_dir=download_dir,
            test_output_dir=test_output_dir,
        )
File without changes
@@ -0,0 +1,13 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+
6
@pytest.fixture
def embedder_file() -> Path:
    """Path to a serialized-elements asset used as the shared embedder input."""
    int_test_dir = Path(__file__).parent
    assets_dir = int_test_dir / "assets"
    embedder_file = assets_dir / "DA-1p-with-duplicate-pages.pdf.json"
    # Fail fast with a clear location if the asset is missing from the checkout
    assert embedder_file.exists()
    assert embedder_file.is_file()
    return embedder_file
@@ -0,0 +1,49 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
+ from test.integration.utils import requires_env
7
+ from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
8
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
+
10
+
11
def get_aws_credentials() -> dict:
    """Read the AWS test credentials from the environment; fail if unset."""
    access_key = os.getenv("AWS_ACCESS_KEY_ID", None)
    assert access_key
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", None)
    assert secret_key
    credentials = {
        "aws_access_key_id": access_key,
        "aws_secret_access_key": secret_key,
    }
    return credentials
17
+
18
+
19
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_bedrock_embedder(embedder_file: Path):
    """End-to-end embedder run through the v2 Embedder wrapper."""
    credentials = get_aws_credentials()
    embedder = Embedder(
        config=EmbedderConfig(
            embedding_provider="aws-bedrock",
            embedding_aws_access_key_id=credentials["aws_access_key_id"],
            embedding_aws_secret_access_key=credentials["aws_secret_access_key"],
        )
    )
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_raw_bedrock_embedder(embedder_file: Path):
    """Exercise the encoder class directly and check embedding dimensions."""
    credentials = get_aws_credentials()
    encoder_config = BedrockEmbeddingConfig(
        aws_access_key_id=credentials["aws_access_key_id"],
        aws_secret_access_key=credentials["aws_secret_access_key"],
    )
    validate_raw_embedder(
        embedder=BedrockEmbeddingEncoder(config=encoder_config),
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,26 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
5
+ from unstructured_ingest.embed.huggingface import (
6
+ HuggingFaceEmbeddingConfig,
7
+ HuggingFaceEmbeddingEncoder,
8
+ )
9
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
10
+
11
+
12
def test_huggingface_embedder(embedder_file: Path):
    """End-to-end embedder run through the v2 Embedder wrapper (no API key needed)."""
    embedder = Embedder(config=EmbedderConfig(embedding_provider="huggingface"))
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


def test_raw_hugginface_embedder(embedder_file: Path):
    """Exercise the encoder class directly and check embedding dimensions."""
    validate_raw_embedder(
        embedder=HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig()),
        embedder_file=embedder_file,
        expected_dimensions=(384,),
    )
@@ -0,0 +1,47 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
+ from test.integration.utils import requires_env
7
+ from unstructured_ingest.embed.mixedbreadai import (
8
+ MixedbreadAIEmbeddingConfig,
9
+ MixedbreadAIEmbeddingEncoder,
10
+ )
11
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
12
+
13
# Name of the environment variable holding the Mixedbread API key
API_KEY = "MXBAI_API_KEY"


def get_api_key() -> str:
    """Read the Mixedbread API key from the environment; fail if unset."""
    key = os.getenv(API_KEY, None)
    assert key
    return key


@requires_env(API_KEY)
def test_mixedbread_embedder(embedder_file: Path):
    """End-to-end embedder run through the v2 Embedder wrapper."""
    embedder = Embedder(
        config=EmbedderConfig(
            embedding_provider="mixedbread-ai", embedding_api_key=get_api_key()
        )
    )
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_mixedbread_embedder(embedder_file: Path):
    """Exercise the encoder class directly and check embedding dimensions."""
    encoder = MixedbreadAIEmbeddingEncoder(
        config=MixedbreadAIEmbeddingConfig(api_key=get_api_key())
    )
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,41 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
+ from test.integration.utils import requires_env
7
+ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
8
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
+
10
# Name of the environment variable holding the OctoAI API key
API_KEY = "OCTOAI_API_KEY"


def get_api_key() -> str:
    """Read the OctoAI API key from the environment; fail if unset."""
    key = os.getenv(API_KEY, None)
    assert key
    return key


@requires_env(API_KEY)
def test_octoai_embedder(embedder_file: Path):
    """End-to-end embedder run through the v2 Embedder wrapper."""
    embedder = Embedder(
        config=EmbedderConfig(embedding_provider="octoai", embedding_api_key=get_api_key())
    )
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_octoai_embedder(embedder_file: Path):
    """Exercise the encoder class directly and check embedding dimensions."""
    encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1024,)
    )
@@ -0,0 +1,41 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
+ from test.integration.utils import requires_env
7
+ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
8
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
+
10
# Name of the environment variable holding the OpenAI API key
API_KEY = "OPENAI_API_KEY"


def get_api_key() -> str:
    """Read the OpenAI API key from the environment; fail if unset."""
    key = os.getenv(API_KEY, None)
    assert key
    return key


@requires_env(API_KEY)
def test_openai_embedder(embedder_file: Path):
    """End-to-end embedder run through the v2 Embedder wrapper."""
    embedder = Embedder(
        config=EmbedderConfig(embedding_provider="openai", embedding_api_key=get_api_key())
    )
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_openai_embedder(embedder_file: Path):
    """Exercise the encoder class directly and check embedding dimensions."""
    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1536,)
    )
@@ -0,0 +1,41 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
6
+ from test.integration.utils import requires_env
7
+ from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
8
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
9
+
10
# Name of the environment variable holding the VertexAI API key
API_KEY = "VERTEXAI_API_KEY"


def get_api_key() -> str:
    """Read the VertexAI API key from the environment; fail if unset."""
    key = os.getenv(API_KEY, None)
    assert key
    return key


@requires_env(API_KEY)
def test_vertexai_embedder(embedder_file: Path):
    """End-to-end embedder run through the v2 Embedder wrapper."""
    embedder = Embedder(
        config=EmbedderConfig(embedding_provider="vertexai", embedding_api_key=get_api_key())
    )
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_vertexai_embedder(embedder_file: Path):
    """Exercise the encoder class directly and check embedding dimensions."""
    encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(768,)
    )