unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187) hide show
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,152 +0,0 @@
1
- import json
2
- import time
3
- from pathlib import Path
4
-
5
- import pytest
6
- import requests
7
- import weaviate
8
- from weaviate.client import WeaviateClient
9
-
10
- from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
11
- from test.integration.connectors.utils.docker import container_context
12
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
13
- from unstructured_ingest.processes.connectors.weaviate.local import (
14
- CONNECTOR_TYPE,
15
- LocalWeaviateConnectionConfig,
16
- LocalWeaviateUploader,
17
- LocalWeaviateUploaderConfig,
18
- LocalWeaviateUploadStager,
19
- )
20
-
21
- COLLECTION_NAME = "elements"
22
-
23
-
24
- def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
25
- start_time = time.time()
26
- while time.time() - start_time < timeout:
27
- try:
28
- requests.get("http://localhost:8080/v1/.well-known/read", timeout=1)
29
- return
30
- except Exception as e:
31
- print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
32
- time.sleep(interval)
33
- raise TimeoutError("Docker container never came up healthy")
34
-
35
-
36
- @pytest.fixture
37
- def weaviate_instance():
38
- with container_context(
39
- image="semitechnologies/weaviate:1.27.3",
40
- ports={8080: 8080, 50051: 50051},
41
- ) as ctx:
42
- wait_for_container()
43
- yield ctx
44
-
45
-
46
- @pytest.fixture
47
- def collection(weaviate_instance, collections_schema_config: dict) -> str:
48
- with weaviate.connect_to_local() as weaviate_client:
49
- weaviate_client.collections.create_from_dict(config=collections_schema_config)
50
- return COLLECTION_NAME
51
-
52
-
53
- def get_count(client: WeaviateClient) -> int:
54
- collection = client.collections.get(COLLECTION_NAME)
55
- resp = collection.aggregate.over_all(total_count=True)
56
- return resp.total_count
57
-
58
-
59
- def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
60
- with weaviate.connect_to_local() as weaviate_client:
61
- current_count = get_count(client=weaviate_client)
62
- retry_count = 0
63
- while current_count != expected_count and retry_count < retries:
64
- retry_count += 1
65
- time.sleep(interval)
66
- current_count = get_count(client=weaviate_client)
67
- assert current_count == expected_count, (
68
- f"Expected count ({expected_count}) doesn't match how "
69
- f"much came back from collection: {current_count}"
70
- )
71
-
72
-
73
- def run_uploader_and_validate(
74
- uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
75
- ):
76
- uploader.precheck()
77
- uploader.run(path=path, file_data=file_data)
78
- validate_count(expected_count=expected_count)
79
-
80
-
81
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
82
- def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
83
- file_data = FileData(
84
- source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
85
- connector_type=CONNECTOR_TYPE,
86
- identifier="mock file data",
87
- )
88
- stager = LocalWeaviateUploadStager()
89
-
90
- staged_filepath = stager.run(
91
- elements_filepath=upload_file,
92
- file_data=file_data,
93
- output_dir=tmp_path,
94
- output_filename=upload_file.name,
95
- )
96
- dynamic_uploader = LocalWeaviateUploader(
97
- upload_config=LocalWeaviateUploaderConfig(
98
- collection=COLLECTION_NAME,
99
- ),
100
- connection_config=LocalWeaviateConnectionConfig(),
101
- )
102
- fixed_size_uploader = LocalWeaviateUploader(
103
- upload_config=LocalWeaviateUploaderConfig(
104
- collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
105
- ),
106
- connection_config=LocalWeaviateConnectionConfig(),
107
- )
108
- rate_limited_uploader = LocalWeaviateUploader(
109
- upload_config=LocalWeaviateUploaderConfig(
110
- collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
111
- ),
112
- connection_config=LocalWeaviateConnectionConfig(),
113
- )
114
- with staged_filepath.open() as f:
115
- staged_elements = json.load(f)
116
- expected_count = len(staged_elements)
117
-
118
- run_uploader_and_validate(
119
- uploader=dynamic_uploader,
120
- path=staged_filepath,
121
- file_data=file_data,
122
- expected_count=expected_count,
123
- )
124
- run_uploader_and_validate(
125
- uploader=fixed_size_uploader,
126
- path=staged_filepath,
127
- file_data=file_data,
128
- expected_count=expected_count,
129
- )
130
- run_uploader_and_validate(
131
- uploader=rate_limited_uploader,
132
- path=staged_filepath,
133
- file_data=file_data,
134
- expected_count=expected_count,
135
- )
136
-
137
-
138
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
139
- def test_weaviate_local_create_destination(weaviate_instance):
140
- uploader = LocalWeaviateUploader(
141
- upload_config=LocalWeaviateUploaderConfig(),
142
- connection_config=LocalWeaviateConnectionConfig(),
143
- )
144
- collection_name = "system_created-123"
145
- formatted_collection_name = "System_created_123"
146
- created = uploader.create_destination(destination_name=collection_name)
147
- assert created
148
- with uploader.connection_config.get_client() as weaviate_client:
149
- assert weaviate_client.collections.exists(name=formatted_collection_name)
150
-
151
- created = uploader.create_destination(destination_name=collection_name)
152
- assert not created
File without changes
@@ -1,13 +0,0 @@
1
- from pathlib import Path
2
-
3
- import pytest
4
-
5
-
6
- @pytest.fixture
7
- def embedder_file() -> Path:
8
- int_test_dir = Path(__file__).parent
9
- assets_dir = int_test_dir / "assets"
10
- embedder_file = assets_dir / "DA-1p-with-duplicate-pages.pdf.json"
11
- assert embedder_file.exists()
12
- assert embedder_file.is_file()
13
- return embedder_file
@@ -1,57 +0,0 @@
1
- import json
2
- import os
3
- from dataclasses import dataclass
4
- from pathlib import Path
5
-
6
- from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
7
- from test.integration.utils import requires_env
8
- from unstructured_ingest.embed.azure_openai import (
9
- AzureOpenAIEmbeddingConfig,
10
- AzureOpenAIEmbeddingEncoder,
11
- )
12
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
13
-
14
- API_KEY = "AZURE_OPENAI_API_KEY"
15
- ENDPOINT = "AZURE_OPENAI_ENDPOINT"
16
-
17
-
18
- @dataclass(frozen=True)
19
- class AzureData:
20
- api_key: str
21
- endpoint: str
22
-
23
-
24
- def get_azure_data() -> AzureData:
25
- api_key = os.getenv(API_KEY, None)
26
- assert api_key
27
- endpoint = os.getenv(ENDPOINT, None)
28
- assert endpoint
29
- return AzureData(api_key, endpoint)
30
-
31
-
32
- @requires_env(API_KEY, ENDPOINT)
33
- def test_azure_openai_embedder(embedder_file: Path):
34
- azure_data = get_azure_data()
35
- embedder_config = EmbedderConfig(
36
- embedding_provider="azure-openai",
37
- embedding_api_key=azure_data.api_key,
38
- embedding_azure_endpoint=azure_data.endpoint,
39
- )
40
- embedder = Embedder(config=embedder_config)
41
- results = embedder.run(elements_filepath=embedder_file)
42
- assert results
43
- with embedder_file.open("r") as f:
44
- original_elements = json.load(f)
45
- validate_embedding_output(original_elements=original_elements, output_elements=results)
46
-
47
-
48
- @requires_env(API_KEY, ENDPOINT)
49
- def test_raw_azure_openai_embedder(embedder_file: Path):
50
- azure_data = get_azure_data()
51
- embedder = AzureOpenAIEmbeddingEncoder(
52
- config=AzureOpenAIEmbeddingConfig(
53
- api_key=azure_data.api_key,
54
- azure_endpoint=azure_data.endpoint,
55
- )
56
- )
57
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
@@ -1,103 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.bedrock import (
14
- AsyncBedrockEmbeddingEncoder,
15
- BedrockEmbeddingConfig,
16
- BedrockEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.errors_v2 import UserAuthError, UserError
19
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
20
-
21
-
22
- def get_aws_credentials() -> dict:
23
- access_key = os.getenv("AWS_ACCESS_KEY_ID", None)
24
- assert access_key
25
- secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", None)
26
- assert secret_key
27
- return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
28
-
29
-
30
- @requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
31
- def test_bedrock_embedder(embedder_file: Path):
32
- aws_credentials = get_aws_credentials()
33
- embedder_config = EmbedderConfig(
34
- embedding_provider="bedrock",
35
- embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
36
- embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
37
- )
38
- embedder = Embedder(config=embedder_config)
39
- results = embedder.run(elements_filepath=embedder_file)
40
- assert results
41
- with embedder_file.open("r") as f:
42
- original_elements = json.load(f)
43
- validate_embedding_output(original_elements=original_elements, output_elements=results)
44
-
45
-
46
- @requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
47
- def test_raw_bedrock_embedder(embedder_file: Path):
48
- aws_credentials = get_aws_credentials()
49
- embedder = BedrockEmbeddingEncoder(
50
- config=BedrockEmbeddingConfig(
51
- aws_access_key_id=aws_credentials["aws_access_key_id"],
52
- aws_secret_access_key=aws_credentials["aws_secret_access_key"],
53
- )
54
- )
55
- validate_raw_embedder(
56
- embedder=embedder,
57
- embedder_file=embedder_file,
58
- expected_dimension=1536,
59
- expected_is_unit_vector=False,
60
- )
61
-
62
-
63
- def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
64
- embedder = BedrockEmbeddingEncoder(
65
- config=BedrockEmbeddingConfig(
66
- aws_access_key_id="no_key",
67
- aws_secret_access_key="no_secret",
68
- )
69
- )
70
- with pytest.raises(UserAuthError):
71
- embedder.get_exemplary_embedding()
72
-
73
-
74
- @requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
75
- def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
76
- aws_credentials = get_aws_credentials()
77
- embedder = BedrockEmbeddingEncoder(
78
- config=BedrockEmbeddingConfig(
79
- aws_access_key_id=aws_credentials["aws_access_key_id"],
80
- aws_secret_access_key=aws_credentials["aws_secret_access_key"],
81
- model_name="invalid_model",
82
- )
83
- )
84
- with pytest.raises(UserError):
85
- embedder.get_exemplary_embedding()
86
-
87
-
88
- @requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
89
- @pytest.mark.asyncio
90
- async def test_raw_async_bedrock_embedder(embedder_file: Path):
91
- aws_credentials = get_aws_credentials()
92
- embedder = AsyncBedrockEmbeddingEncoder(
93
- config=BedrockEmbeddingConfig(
94
- aws_access_key_id=aws_credentials["aws_access_key_id"],
95
- aws_secret_access_key=aws_credentials["aws_secret_access_key"],
96
- )
97
- )
98
- await validate_raw_embedder_async(
99
- embedder=embedder,
100
- embedder_file=embedder_file,
101
- expected_dimension=1536,
102
- expected_is_unit_vector=False,
103
- )
@@ -1,24 +0,0 @@
1
- import json
2
- from pathlib import Path
3
-
4
- from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
5
- from unstructured_ingest.embed.huggingface import (
6
- HuggingFaceEmbeddingConfig,
7
- HuggingFaceEmbeddingEncoder,
8
- )
9
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
10
-
11
-
12
- def test_huggingface_embedder(embedder_file: Path):
13
- embedder_config = EmbedderConfig(embedding_provider="huggingface")
14
- embedder = Embedder(config=embedder_config)
15
- results = embedder.run(elements_filepath=embedder_file)
16
- assert results
17
- with embedder_file.open("r") as f:
18
- original_elements = json.load(f)
19
- validate_embedding_output(original_elements=original_elements, output_elements=results)
20
-
21
-
22
- def test_raw_hugginface_embedder(embedder_file: Path):
23
- embedder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
24
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=384)
@@ -1,71 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.mixedbreadai import (
14
- AsyncMixedbreadAIEmbeddingEncoder,
15
- MixedbreadAIEmbeddingConfig,
16
- MixedbreadAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
19
-
20
- API_KEY = "MXBAI_API_KEY"
21
-
22
-
23
- def get_api_key() -> str:
24
- api_key = os.getenv(API_KEY, None)
25
- assert api_key
26
- return api_key
27
-
28
-
29
- @requires_env(API_KEY)
30
- def test_mixedbread_embedder(embedder_file: Path):
31
- api_key = get_api_key()
32
- embedder_config = EmbedderConfig(embedding_provider="mixedbread-ai", embedding_api_key=api_key)
33
- embedder = Embedder(config=embedder_config)
34
- results = embedder.run(elements_filepath=embedder_file)
35
- assert results
36
- with embedder_file.open("r") as f:
37
- original_elements = json.load(f)
38
- validate_embedding_output(original_elements=original_elements, output_elements=results)
39
-
40
-
41
- @requires_env(API_KEY)
42
- def test_raw_mixedbread_embedder(embedder_file: Path):
43
- api_key = get_api_key()
44
- embedder = MixedbreadAIEmbeddingEncoder(
45
- config=MixedbreadAIEmbeddingConfig(
46
- api_key=api_key,
47
- )
48
- )
49
- validate_raw_embedder(
50
- embedder=embedder,
51
- embedder_file=embedder_file,
52
- expected_dimension=1024,
53
- expected_is_unit_vector=True,
54
- )
55
-
56
-
57
- @requires_env(API_KEY)
58
- @pytest.mark.asyncio
59
- async def test_raw_async_mixedbread_embedder(embedder_file: Path):
60
- api_key = get_api_key()
61
- embedder = AsyncMixedbreadAIEmbeddingEncoder(
62
- config=MixedbreadAIEmbeddingConfig(
63
- api_key=api_key,
64
- )
65
- )
66
- await validate_raw_embedder_async(
67
- embedder=embedder,
68
- embedder_file=embedder_file,
69
- expected_dimension=1024,
70
- expected_is_unit_vector=True,
71
- )
@@ -1,75 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.octoai import (
14
- AsyncOctoAIEmbeddingEncoder,
15
- OctoAiEmbeddingConfig,
16
- OctoAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.errors_v2 import UserAuthError
19
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
20
-
21
- API_KEY = "OCTOAI_API_KEY"
22
-
23
-
24
- def get_api_key() -> str:
25
- api_key = os.getenv(API_KEY, None)
26
- assert api_key
27
- return api_key
28
-
29
-
30
- @requires_env(API_KEY)
31
- def test_octoai_embedder(embedder_file: Path):
32
- api_key = get_api_key()
33
- embedder_config = EmbedderConfig(embedding_provider="octoai", embedding_api_key=api_key)
34
- embedder = Embedder(config=embedder_config)
35
- results = embedder.run(elements_filepath=embedder_file)
36
- assert results
37
- with embedder_file.open("r") as f:
38
- original_elements = json.load(f)
39
- validate_embedding_output(original_elements=original_elements, output_elements=results)
40
-
41
-
42
- @requires_env(API_KEY)
43
- def test_raw_octoai_embedder(embedder_file: Path):
44
- api_key = get_api_key()
45
- embedder = OctoAIEmbeddingEncoder(
46
- config=OctoAiEmbeddingConfig(
47
- api_key=api_key,
48
- )
49
- )
50
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
51
-
52
-
53
- @pytest.mark.skip(reason="Unexpected connection error at the moment")
54
- def test_raw_octoai_embedder_invalid_credentials():
55
- embedder = OctoAIEmbeddingEncoder(
56
- config=OctoAiEmbeddingConfig(
57
- api_key="fake_api_key",
58
- )
59
- )
60
- with pytest.raises(UserAuthError):
61
- embedder.get_exemplary_embedding()
62
-
63
-
64
- @requires_env(API_KEY)
65
- @pytest.mark.asyncio
66
- async def test_raw_async_octoai_embedder(embedder_file: Path):
67
- api_key = get_api_key()
68
- embedder = AsyncOctoAIEmbeddingEncoder(
69
- config=OctoAiEmbeddingConfig(
70
- api_key=api_key,
71
- )
72
- )
73
- await validate_raw_embedder_async(
74
- embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
75
- )
@@ -1,74 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.openai import (
14
- AsyncOpenAIEmbeddingEncoder,
15
- OpenAIEmbeddingConfig,
16
- OpenAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.errors_v2 import UserAuthError
19
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
20
-
21
- API_KEY = "OPENAI_API_KEY"
22
-
23
-
24
- def get_api_key() -> str:
25
- api_key = os.getenv(API_KEY, None)
26
- assert api_key
27
- return api_key
28
-
29
-
30
- @requires_env(API_KEY)
31
- def test_openai_embedder(embedder_file: Path):
32
- api_key = get_api_key()
33
- embedder_config = EmbedderConfig(embedding_provider="openai", embedding_api_key=api_key)
34
- embedder = Embedder(config=embedder_config)
35
- results = embedder.run(elements_filepath=embedder_file)
36
- assert results
37
- with embedder_file.open("r") as f:
38
- original_elements = json.load(f)
39
- validate_embedding_output(original_elements=original_elements, output_elements=results)
40
-
41
-
42
- @requires_env(API_KEY)
43
- def test_raw_openai_embedder(embedder_file: Path):
44
- api_key = get_api_key()
45
- embedder = OpenAIEmbeddingEncoder(
46
- config=OpenAIEmbeddingConfig(
47
- api_key=api_key,
48
- )
49
- )
50
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
51
-
52
-
53
- def test_raw_openai_embedder_invalid_credentials():
54
- embedder = OpenAIEmbeddingEncoder(
55
- config=OpenAIEmbeddingConfig(
56
- api_key="fake_api_key",
57
- )
58
- )
59
- with pytest.raises(UserAuthError):
60
- embedder.get_exemplary_embedding()
61
-
62
-
63
- @requires_env(API_KEY)
64
- @pytest.mark.asyncio
65
- async def test_raw_async_openai_embedder(embedder_file: Path):
66
- api_key = get_api_key()
67
- embedder = AsyncOpenAIEmbeddingEncoder(
68
- config=OpenAIEmbeddingConfig(
69
- api_key=api_key,
70
- )
71
- )
72
- await validate_raw_embedder_async(
73
- embedder=embedder, embedder_file=embedder_file, expected_dimension=1536
74
- )
@@ -1,71 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.togetherai import (
14
- AsyncTogetherAIEmbeddingEncoder,
15
- TogetherAIEmbeddingConfig,
16
- TogetherAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.errors_v2 import UserAuthError
19
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
20
-
21
- API_KEY = "TOGETHERAI_API_KEY"
22
-
23
-
24
- def get_api_key() -> str:
25
- api_key = os.getenv(API_KEY, None)
26
- assert api_key
27
- return api_key
28
-
29
-
30
- @requires_env(API_KEY)
31
- def test_togetherai_embedder(embedder_file: Path):
32
- api_key = get_api_key()
33
- embedder_config = EmbedderConfig(embedding_provider="togetherai", embedding_api_key=api_key)
34
- embedder = Embedder(config=embedder_config)
35
- results = embedder.run(elements_filepath=embedder_file)
36
- assert results
37
- with embedder_file.open("r") as f:
38
- original_elements = json.load(f)
39
- validate_embedding_output(original_elements=original_elements, output_elements=results)
40
-
41
-
42
- @requires_env(API_KEY)
43
- def test_raw_togetherai_embedder(embedder_file: Path):
44
- api_key = get_api_key()
45
- embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key=api_key))
46
- validate_raw_embedder(
47
- embedder=embedder,
48
- embedder_file=embedder_file,
49
- expected_dimension=768,
50
- expected_is_unit_vector=False,
51
- )
52
-
53
-
54
- def test_raw_togetherai_embedder_invalid_credentials():
55
- embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
56
-
57
- with pytest.raises(UserAuthError):
58
- embedder.get_exemplary_embedding()
59
-
60
-
61
- @requires_env(API_KEY)
62
- @pytest.mark.asyncio
63
- async def test_raw_async_togetherai_embedder(embedder_file: Path):
64
- api_key = get_api_key()
65
- embedder = AsyncTogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key=api_key))
66
- await validate_raw_embedder_async(
67
- embedder=embedder,
68
- embedder_file=embedder_file,
69
- expected_dimension=768,
70
- expected_is_unit_vector=False,
71
- )