unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; see the registry's advisory page for more details.

Files changed (83)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/test_postgres.py +100 -0
  10. test/integration/connectors/test_s3.py +152 -0
  11. test/integration/connectors/test_sqlite.py +91 -0
  12. test/integration/connectors/utils/__init__.py +0 -0
  13. test/integration/connectors/utils/constants.py +7 -0
  14. test/integration/connectors/utils/docker_compose.py +44 -0
  15. test/integration/connectors/utils/validation.py +198 -0
  16. test/integration/embedders/__init__.py +0 -0
  17. test/integration/embedders/conftest.py +13 -0
  18. test/integration/embedders/test_bedrock.py +49 -0
  19. test/integration/embedders/test_huggingface.py +26 -0
  20. test/integration/embedders/test_mixedbread.py +47 -0
  21. test/integration/embedders/test_octoai.py +41 -0
  22. test/integration/embedders/test_openai.py +41 -0
  23. test/integration/embedders/test_vertexai.py +41 -0
  24. test/integration/embedders/test_voyageai.py +41 -0
  25. test/integration/embedders/togetherai.py +43 -0
  26. test/integration/embedders/utils.py +44 -0
  27. test/integration/partitioners/__init__.py +0 -0
  28. test/integration/partitioners/test_partitioner.py +75 -0
  29. test/integration/utils.py +15 -0
  30. test/unit/__init__.py +0 -0
  31. test/unit/embed/__init__.py +0 -0
  32. test/unit/embed/test_mixedbreadai.py +41 -0
  33. test/unit/embed/test_octoai.py +20 -0
  34. test/unit/embed/test_openai.py +20 -0
  35. test/unit/embed/test_vertexai.py +25 -0
  36. test/unit/embed/test_voyageai.py +24 -0
  37. test/unit/test_chunking_utils.py +36 -0
  38. test/unit/test_error.py +27 -0
  39. test/unit/test_interfaces.py +280 -0
  40. test/unit/test_interfaces_v2.py +26 -0
  41. test/unit/test_logger.py +78 -0
  42. test/unit/test_utils.py +164 -0
  43. test/unit/test_utils_v2.py +82 -0
  44. unstructured_ingest/__version__.py +1 -1
  45. unstructured_ingest/cli/interfaces.py +2 -2
  46. unstructured_ingest/connector/notion/types/block.py +1 -0
  47. unstructured_ingest/connector/notion/types/database.py +1 -0
  48. unstructured_ingest/connector/notion/types/page.py +1 -0
  49. unstructured_ingest/embed/bedrock.py +0 -20
  50. unstructured_ingest/embed/huggingface.py +0 -21
  51. unstructured_ingest/embed/interfaces.py +29 -3
  52. unstructured_ingest/embed/mixedbreadai.py +0 -36
  53. unstructured_ingest/embed/octoai.py +2 -24
  54. unstructured_ingest/embed/openai.py +0 -20
  55. unstructured_ingest/embed/togetherai.py +40 -0
  56. unstructured_ingest/embed/vertexai.py +0 -20
  57. unstructured_ingest/embed/voyageai.py +1 -24
  58. unstructured_ingest/interfaces.py +1 -1
  59. unstructured_ingest/v2/cli/utils/click.py +21 -2
  60. unstructured_ingest/v2/interfaces/connector.py +22 -2
  61. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  62. unstructured_ingest/v2/processes/chunker.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  64. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  65. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  71. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  72. unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
  73. unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
  74. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
  75. unstructured_ingest/v2/processes/embedder.py +13 -0
  76. unstructured_ingest/v2/processes/partitioner.py +2 -1
  77. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +14 -12
  78. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +82 -29
  79. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
  80. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
  83. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,198 @@
1
import json
import os
import shutil
from copy import deepcopy
from dataclasses import dataclass, field, replace
from pathlib import Path
from typing import Callable, Optional

from deepdiff import DeepDiff

from test.integration.connectors.utils.constants import expected_results_path
from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
12
+
13
+
14
@dataclass
class ValidationConfigs:
    """Tunable checks applied to the output of a source connector integration test.

    Attributes:
        test_id: maps this test to its directory of expected-results fixtures.
        expected_num_files: if set, the download dir must contain exactly this many files.
        predownload_file_data_check / postdownload_file_data_check: optional hooks run
            against each FileData before/after download.
        exclude_fields: dotted paths dropped from file data before comparison because
            they change on every run; a trailing "*" clears the whole parent dict.
        exclude_fields_extend: extra exclusions appended to the defaults.
    """

    test_id: str
    expected_num_files: Optional[int] = None
    predownload_file_data_check: Optional[Callable[[FileData], None]] = None
    postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
    exclude_fields: list[str] = field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
    )
    exclude_fields_extend: list[str] = field(default_factory=list)

    def get_exclude_fields(self) -> list[str]:
        # Return a fresh list: the previous implementation extended
        # self.exclude_fields in place, so every call appended another copy of
        # exclude_fields_extend to the instance's state.
        return self.exclude_fields + self.exclude_fields_extend

    def run_file_data_validation(
        self, predownload_file_data: FileData, postdownload_file_data: FileData
    ):
        """Run the optional user-provided hooks on the pre/post download file data."""
        if predownload_file_data_check := self.predownload_file_data_check:
            predownload_file_data_check(predownload_file_data)
        if postdownload_file_data_check := self.postdownload_file_data_check:
            postdownload_file_data_check(postdownload_file_data)

    def run_download_dir_validation(self, download_dir: Path):
        """If expected_num_files is set, assert the download dir holds exactly that many files."""
        if expected_num_files := self.expected_num_files:
            downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
            assert len(downloaded_files) == expected_num_files

    def test_output_dir(self) -> Path:
        """Directory holding the expected fixtures for this test id."""
        return expected_results_path / self.test_id

    def omit_ignored_fields(self, data: dict) -> dict:
        """Return a copy of data with every excluded (dotted) field removed.

        Ignores fields that dynamically change every time the tests run. A deep
        copy is taken so the caller's nested dicts are never mutated (the old
        shallow copy popped keys out of the caller's nested structures).
        """
        copied_data = deepcopy(data)
        for exclude_field in self.get_exclude_fields():
            *parents, leaf = exclude_field.split(".")
            current = copied_data
            for key in parents:
                current = current.get(key, {})
            if leaf == "*":
                current.clear()
            else:
                current.pop(leaf, None)
        return copied_data
66
+
67
+
68
def get_files(dir_path: Path) -> list[str]:
    """Return the names of files directly inside dir_path, relative to it."""
    prefix = str(dir_path)
    names: list[str] = []
    for entry in dir_path.iterdir():
        if entry.is_file():
            names.append(str(entry).replace(prefix, "").lstrip("/"))
    return names
72
+
73
+
74
def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
    """Assert the expected fixture files and the generated file data match one-to-one."""
    expected = set(get_files(dir_path=expected_output_dir))
    current = {f"{file_data.identifier}.json" for file_data in all_file_data}
    # Symmetric difference: files present on exactly one side are mismatches.
    mismatched = expected ^ current
    assert not mismatched, "diff in files that exist: {}".format(", ".join(mismatched))
79
+
80
+
81
def check_contents(
    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
):
    """Compare each generated file data dict against its saved expected fixture.

    Dynamic fields configured on `configs` are stripped from both sides before
    the comparison. Every diff is printed before failing so a single run reports
    all mismatching files, and the failure message names the offending files
    (previously it interpolated the boolean flag, printing just "True").
    """
    files_with_diffs: list[str] = []
    for file_data in all_file_data:
        file_data_path = expected_output_dir / f"{file_data.identifier}.json"
        with file_data_path.open("r") as file:
            expected_file_data_contents = json.load(file)
        current_file_data_contents = file_data.to_dict()
        expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
        current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
        diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
        if diff:
            files_with_diffs.append(file_data.identifier)
            print(diff.to_json(indent=2))
    assert not files_with_diffs, f"Diffs found between files: {files_with_diffs}"
97
+
98
+
99
def run_expected_results_validation(
    expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
):
    # Validate generated file data against saved fixtures: first that the same set
    # of files exists, then that each file's contents match (minus the dynamic
    # fields `configs` is set to ignore).
    check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
    check_contents(
        expected_output_dir=expected_output_dir, all_file_data=all_file_data, configs=configs
    )
106
+
107
+
108
def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
    """Assert the downloaded file layout matches the recorded directory_structure.json."""
    directory_record = expected_output_dir / "directory_structure.json"
    with directory_record.open("r") as directory_file:
        recorded = json.load(directory_file)
    assert recorded["directory_structure"] == download_files
114
+
115
+
116
def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[FileData]):
    """Regenerate the expected-results fixtures from the current run's output."""
    # Start from a clean slate
    shutil.rmtree(path=output_dir, ignore_errors=True)
    output_dir.mkdir(parents=True)
    # Persist each file data dict under <identifier>.json
    file_data_output_path = output_dir / "file_data"
    file_data_output_path.mkdir(parents=True)
    for file_data in all_file_data:
        target = file_data_output_path / f"{file_data.identifier}.json"
        with target.open(mode="w") as f:
            json.dump(file_data.to_dict(), f, indent=2)

    # Record the (sorted) file structure of the download directory
    download_files = sorted(get_files(dir_path=download_dir))
    download_dir_record = output_dir / "directory_structure.json"
    with download_dir_record.open(mode="w") as f:
        json.dump({"directory_structure": download_files}, f, indent=2)
134
+
135
+
136
def run_all_validations(
    configs: ValidationConfigs,
    predownload_file_data: list[FileData],
    postdownload_file_data: list[FileData],
    download_dir: Path,
    test_output_dir: Path,
):
    """Run every configured validation over the results of a connector run."""
    # Per-file hooks on each pre/post download pair
    for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
        configs.run_file_data_validation(
            predownload_file_data=pre_data, postdownload_file_data=post_data
        )
    # Download-dir level checks
    configs.run_download_dir_validation(download_dir=download_dir)
    # Saved-fixture comparisons on the post-download file data
    run_expected_results_validation(
        expected_output_dir=test_output_dir / "file_data",
        all_file_data=postdownload_file_data,
        configs=configs,
    )
    # Directory-layout comparison against the recorded structure
    download_files = sorted(get_files(dir_path=download_dir))
    run_directory_structure_validation(
        expected_output_dir=configs.test_output_dir(), download_files=download_files
    )
158
+
159
+
160
async def source_connector_validation(
    indexer: Indexer,
    downloader: Downloader,
    configs: ValidationConfigs,
    overwrite_fixtures: Optional[bool] = None,
) -> None:
    """Run common validations on the process of running a source connector.

    Supports dynamic validators that get passed in along with comparisons on the
    saved expected values. If overwrite_fixtures is set to True, ignores all
    validators and instead overwrites the expected values with what gets
    generated by this test.

    When overwrite_fixtures is None it falls back to the OVERWRITE_FIXTURES env
    var, read at call time. (The previous default read the env var at import
    time, so setting it after the module was imported had no effect.)
    """
    if overwrite_fixtures is None:
        overwrite_fixtures = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true"
    all_predownload_file_data = []
    all_postdownload_file_data = []
    indexer.precheck()
    download_dir = downloader.download_config.download_dir
    test_output_dir = configs.test_output_dir()
    for file_data in indexer.run():
        assert file_data
        # Snapshot the file data before download so post-download mutations are visible
        predownload_file_data = replace(file_data)
        all_predownload_file_data.append(predownload_file_data)
        if downloader.is_async():
            resp = await downloader.run_async(file_data=file_data)
        else:
            resp = downloader.run(file_data=file_data)
        postdownload_file_data = replace(resp["file_data"])
        all_postdownload_file_data.append(postdownload_file_data)
    if not overwrite_fixtures:
        run_all_validations(
            configs=configs,
            predownload_file_data=all_predownload_file_data,
            postdownload_file_data=all_postdownload_file_data,
            download_dir=download_dir,
            test_output_dir=test_output_dir,
        )
    else:
        update_fixtures(
            output_dir=test_output_dir,
            download_dir=download_dir,
            all_file_data=all_postdownload_file_data,
        )
File without changes
@@ -0,0 +1,13 @@
1
from pathlib import Path

import pytest


@pytest.fixture
def embedder_file() -> Path:
    """Shared elements-JSON asset consumed by the embedder integration tests."""
    assets_dir = Path(__file__).parent / "assets"
    asset = assets_dir / "DA-1p-with-duplicate-pages.pdf.json"
    assert asset.exists()
    assert asset.is_file()
    return asset
@@ -0,0 +1,49 @@
1
import json
import os
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from test.integration.utils import requires_env
from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig


def get_aws_credentials() -> dict:
    """Read the required AWS credentials from the environment, failing fast when unset."""
    access_key = os.getenv("AWS_ACCESS_KEY_ID")
    assert access_key
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    assert secret_key
    return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}


@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_bedrock_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via aws-bedrock."""
    creds = get_aws_credentials()
    embedder = Embedder(
        config=EmbedderConfig(
            embedding_provider="aws-bedrock",
            embedding_aws_access_key_id=creds["aws_access_key_id"],
            embedding_aws_secret_access_key=creds["aws_secret_access_key"],
        )
    )
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_raw_bedrock_embedder(embedder_file: Path):
    """Exercise BedrockEmbeddingEncoder directly, outside the pipeline wrapper."""
    creds = get_aws_credentials()
    encoder = BedrockEmbeddingEncoder(
        config=BedrockEmbeddingConfig(
            aws_access_key_id=creds["aws_access_key_id"],
            aws_secret_access_key=creds["aws_secret_access_key"],
        )
    )
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,26 @@
1
import json
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from unstructured_ingest.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig


def test_huggingface_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via a local huggingface model."""
    embedder = Embedder(config=EmbedderConfig(embedding_provider="huggingface"))
    results = embedder.run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


# NOTE(review): "hugginface" typo kept in the test name — renaming would change
# the test ID referenced by CI selections.
def test_raw_hugginface_embedder(embedder_file: Path):
    """Exercise HuggingFaceEmbeddingEncoder directly, outside the pipeline wrapper."""
    encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(384,)
    )
@@ -0,0 +1,47 @@
1
import json
import os
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from test.integration.utils import requires_env
from unstructured_ingest.embed.mixedbreadai import (
    MixedbreadAIEmbeddingConfig,
    MixedbreadAIEmbeddingEncoder,
)
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

# Env var holding the Mixedbread AI API key
API_KEY = "MXBAI_API_KEY"


def get_api_key() -> str:
    """Fetch the API key from the environment, failing fast if unset."""
    api_key = os.getenv(API_KEY)
    assert api_key
    return api_key


@requires_env(API_KEY)
def test_mixedbread_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via mixedbread-ai."""
    config = EmbedderConfig(embedding_provider="mixedbread-ai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_mixedbread_embedder(embedder_file: Path):
    """Exercise MixedbreadAIEmbeddingEncoder directly, outside the pipeline wrapper."""
    encoder = MixedbreadAIEmbeddingEncoder(
        config=MixedbreadAIEmbeddingConfig(api_key=get_api_key())
    )
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,41 @@
1
import json
import os
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from test.integration.utils import requires_env
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

# Env var holding the OctoAI API key
API_KEY = "OCTOAI_API_KEY"


def get_api_key() -> str:
    """Fetch the API key from the environment, failing fast if unset."""
    api_key = os.getenv(API_KEY)
    assert api_key
    return api_key


@requires_env(API_KEY)
def test_octoai_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via octoai."""
    config = EmbedderConfig(embedding_provider="octoai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_octoai_embedder(embedder_file: Path):
    """Exercise OctoAIEmbeddingEncoder directly, outside the pipeline wrapper."""
    encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1024,)
    )
@@ -0,0 +1,41 @@
1
import json
import os
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from test.integration.utils import requires_env
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

# Env var holding the OpenAI API key
API_KEY = "OPENAI_API_KEY"


def get_api_key() -> str:
    """Fetch the API key from the environment, failing fast if unset."""
    api_key = os.getenv(API_KEY)
    assert api_key
    return api_key


@requires_env(API_KEY)
def test_openai_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via openai."""
    config = EmbedderConfig(embedding_provider="openai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_openai_embedder(embedder_file: Path):
    """Exercise OpenAIEmbeddingEncoder directly, outside the pipeline wrapper."""
    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1536,)
    )
@@ -0,0 +1,41 @@
1
import json
import os
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from test.integration.utils import requires_env
from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

# Env var holding the Vertex AI API key
API_KEY = "VERTEXAI_API_KEY"


def get_api_key() -> str:
    """Fetch the API key from the environment, failing fast if unset."""
    api_key = os.getenv(API_KEY)
    assert api_key
    return api_key


@requires_env(API_KEY)
def test_vertexai_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via vertexai."""
    config = EmbedderConfig(embedding_provider="vertexai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_vertexai_embedder(embedder_file: Path):
    """Exercise VertexAIEmbeddingEncoder directly, outside the pipeline wrapper."""
    encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(768,)
    )
@@ -0,0 +1,41 @@
1
import json
import os
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from test.integration.utils import requires_env
from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

# Env var holding the Voyage AI API key
API_KEY = "VOYAGEAI_API_KEY"


def get_api_key() -> str:
    """Fetch the API key from the environment, failing fast if unset."""
    api_key = os.getenv(API_KEY)
    assert api_key
    return api_key


@requires_env(API_KEY)
def test_voyageai_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via voyageai."""
    config = EmbedderConfig(embedding_provider="voyageai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_voyageai_embedder(embedder_file: Path):
    """Exercise VoyageAIEmbeddingEncoder directly, outside the pipeline wrapper."""
    encoder = VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder, embedder_file=embedder_file, expected_dimensions=(1024,)
    )
@@ -0,0 +1,43 @@
1
import json
import os
from pathlib import Path

from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
from test.integration.utils import requires_env
from unstructured_ingest.embed.togetherai import (
    TogetherAIEmbeddingConfig,
    TogetherAIEmbeddingEncoder,
)
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

# Env var holding the Together AI API key
API_KEY = "TOGETHERAI_API_KEY"


def get_api_key() -> str:
    """Fetch the API key from the environment, failing fast if unset."""
    api_key = os.getenv(API_KEY)
    assert api_key
    return api_key


@requires_env(API_KEY)
def test_togetherai_embedder(embedder_file: Path):
    """End-to-end: the Embedder pipeline step adds embeddings via togetherai."""
    config = EmbedderConfig(embedding_provider="togetherai", embedding_api_key=get_api_key())
    results = Embedder(config=config).run(elements_filepath=embedder_file)
    assert results
    with embedder_file.open("r") as f:
        original_elements = json.load(f)
    validate_embedding_output(original_elements=original_elements, output_elements=results)


@requires_env(API_KEY)
def test_raw_togetherai_embedder(embedder_file: Path):
    """Exercise TogetherAIEmbeddingEncoder directly, outside the pipeline wrapper."""
    encoder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key=get_api_key()))
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(768,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,44 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
6
+
7
+
8
def validate_embedding_output(original_elements: list[dict], output_elements: list[dict]):
    """
    Make sure the following characteristics are met:
    * The same number of elements are returned
    * For each element that had text, an embeddings entry was added in the output
    * Other than the embedding, nothing about the element was changed

    Unlike the previous implementation, the caller's output_elements are NOT
    mutated: the embeddings entry is stripped from a copy before comparing,
    so the validated results remain intact for further use.
    """
    assert len(original_elements) == len(output_elements)
    for original_element, output_element in zip(original_elements, output_elements):
        comparable = output_element
        if original_element.get("text"):
            assert output_element.get("embeddings", None)
            # Compare against a copy with the embeddings removed
            comparable = {k: v for k, v in output_element.items() if k != "embeddings"}
        assert original_element == comparable
21
+
22
+
23
def validate_raw_embedder(
    embedder: BaseEmbeddingEncoder,
    embedder_file: Path,
    expected_dimensions: Optional[tuple[int, ...]] = None,
    expected_is_unit_vector: bool = True,
):
    """Sanity-check an embedding encoder's metadata and its embedding outputs."""
    with open(embedder_file) as f:
        elements = json.load(f)
    all_text = [element["text"] for element in elements]
    single_text = all_text[0]
    num_of_dimensions = embedder.num_of_dimensions
    if expected_dimensions:
        assert (
            num_of_dimensions == expected_dimensions
        ), f"number of dimensions {num_of_dimensions} didn't match expected: {expected_dimensions}"
    # Unit-vector property must match what the provider advertises
    assert embedder.is_unit_vector == expected_is_unit_vector
    # Single-query embedding has the advertised length
    single_embedding = embedder.embed_query(query=single_text)
    assert len(single_embedding) == num_of_dimensions[0]
    # Bulk embedding leaves elements unchanged apart from the embeddings entry
    embedded_elements = embedder.embed_documents(elements=elements)
    validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
File without changes
@@ -0,0 +1,75 @@
1
import json
import os
from pathlib import Path

import pytest
from unstructured_client.models.errors.sdkerror import SDKError

from test.integration.utils import requires_env
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig

int_test_dir = Path(__file__).parent
assets_dir = int_test_dir / "assets"

all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
non_image_partition_files = [
    path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
]
image_partition_files = [
    path for path in all_partition_files if path not in non_image_partition_files
]


def _api_partitioner(strategy: str) -> Partitioner:
    """Build a Partitioner that hits the hosted API using env-provided credentials."""
    return Partitioner(
        config=PartitionerConfig(
            strategy=strategy,
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
        )
    )


@pytest.mark.parametrize(
    "partition_file", all_partition_files, ids=[path.name for path in all_partition_files]
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_hi_res(partition_file: Path):
    """hi_res partitioning succeeds for every asset; results are saved for inspection."""
    partitioner = _api_partitioner("hi_res")
    results = await partitioner.run_async(filename=partition_file)
    results_dir = int_test_dir / "results"
    results_dir.mkdir(exist_ok=True)
    results_path = results_dir / f"{partition_file.name}.json"
    with results_path.open("w") as f:
        json.dump(results, f, indent=2)
    assert results


@pytest.mark.parametrize(
    "partition_file",
    non_image_partition_files,
    ids=[path.name for path in non_image_partition_files],
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_fast(partition_file: Path):
    """fast partitioning succeeds for every non-image asset."""
    partitioner = _api_partitioner("fast")
    results = await partitioner.run_async(filename=partition_file)
    assert results


@pytest.mark.parametrize(
    "partition_file", image_partition_files, ids=[path.name for path in image_partition_files]
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_fast_error(partition_file: Path):
    """fast partitioning of image assets is expected to be rejected by the API."""
    partitioner = _api_partitioner("fast")
    with pytest.raises(SDKError):
        await partitioner.run_async(filename=partition_file)