unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import time
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
import requests
|
|
7
|
-
import weaviate
|
|
8
|
-
from weaviate.client import WeaviateClient
|
|
9
|
-
|
|
10
|
-
from test.integration.connectors.utils.constants import DESTINATION_TAG, VECTOR_DB_TAG
|
|
11
|
-
from test.integration.connectors.utils.docker import container_context
|
|
12
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
13
|
-
from unstructured_ingest.processes.connectors.weaviate.local import (
|
|
14
|
-
CONNECTOR_TYPE,
|
|
15
|
-
LocalWeaviateConnectionConfig,
|
|
16
|
-
LocalWeaviateUploader,
|
|
17
|
-
LocalWeaviateUploaderConfig,
|
|
18
|
-
LocalWeaviateUploadStager,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
COLLECTION_NAME = "elements"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
|
|
25
|
-
start_time = time.time()
|
|
26
|
-
while time.time() - start_time < timeout:
|
|
27
|
-
try:
|
|
28
|
-
requests.get("http://localhost:8080/v1/.well-known/read", timeout=1)
|
|
29
|
-
return
|
|
30
|
-
except Exception as e:
|
|
31
|
-
print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
|
|
32
|
-
time.sleep(interval)
|
|
33
|
-
raise TimeoutError("Docker container never came up healthy")
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@pytest.fixture
|
|
37
|
-
def weaviate_instance():
|
|
38
|
-
with container_context(
|
|
39
|
-
image="semitechnologies/weaviate:1.27.3",
|
|
40
|
-
ports={8080: 8080, 50051: 50051},
|
|
41
|
-
) as ctx:
|
|
42
|
-
wait_for_container()
|
|
43
|
-
yield ctx
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@pytest.fixture
|
|
47
|
-
def collection(weaviate_instance, collections_schema_config: dict) -> str:
|
|
48
|
-
with weaviate.connect_to_local() as weaviate_client:
|
|
49
|
-
weaviate_client.collections.create_from_dict(config=collections_schema_config)
|
|
50
|
-
return COLLECTION_NAME
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def get_count(client: WeaviateClient) -> int:
|
|
54
|
-
collection = client.collections.get(COLLECTION_NAME)
|
|
55
|
-
resp = collection.aggregate.over_all(total_count=True)
|
|
56
|
-
return resp.total_count
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
|
|
60
|
-
with weaviate.connect_to_local() as weaviate_client:
|
|
61
|
-
current_count = get_count(client=weaviate_client)
|
|
62
|
-
retry_count = 0
|
|
63
|
-
while current_count != expected_count and retry_count < retries:
|
|
64
|
-
retry_count += 1
|
|
65
|
-
time.sleep(interval)
|
|
66
|
-
current_count = get_count(client=weaviate_client)
|
|
67
|
-
assert current_count == expected_count, (
|
|
68
|
-
f"Expected count ({expected_count}) doesn't match how "
|
|
69
|
-
f"much came back from collection: {current_count}"
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def run_uploader_and_validate(
|
|
74
|
-
uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
|
|
75
|
-
):
|
|
76
|
-
uploader.precheck()
|
|
77
|
-
uploader.run(path=path, file_data=file_data)
|
|
78
|
-
validate_count(expected_count=expected_count)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
82
|
-
def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
|
|
83
|
-
file_data = FileData(
|
|
84
|
-
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
|
|
85
|
-
connector_type=CONNECTOR_TYPE,
|
|
86
|
-
identifier="mock file data",
|
|
87
|
-
)
|
|
88
|
-
stager = LocalWeaviateUploadStager()
|
|
89
|
-
|
|
90
|
-
staged_filepath = stager.run(
|
|
91
|
-
elements_filepath=upload_file,
|
|
92
|
-
file_data=file_data,
|
|
93
|
-
output_dir=tmp_path,
|
|
94
|
-
output_filename=upload_file.name,
|
|
95
|
-
)
|
|
96
|
-
dynamic_uploader = LocalWeaviateUploader(
|
|
97
|
-
upload_config=LocalWeaviateUploaderConfig(
|
|
98
|
-
collection=COLLECTION_NAME,
|
|
99
|
-
),
|
|
100
|
-
connection_config=LocalWeaviateConnectionConfig(),
|
|
101
|
-
)
|
|
102
|
-
fixed_size_uploader = LocalWeaviateUploader(
|
|
103
|
-
upload_config=LocalWeaviateUploaderConfig(
|
|
104
|
-
collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
|
|
105
|
-
),
|
|
106
|
-
connection_config=LocalWeaviateConnectionConfig(),
|
|
107
|
-
)
|
|
108
|
-
rate_limited_uploader = LocalWeaviateUploader(
|
|
109
|
-
upload_config=LocalWeaviateUploaderConfig(
|
|
110
|
-
collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
|
|
111
|
-
),
|
|
112
|
-
connection_config=LocalWeaviateConnectionConfig(),
|
|
113
|
-
)
|
|
114
|
-
with staged_filepath.open() as f:
|
|
115
|
-
staged_elements = json.load(f)
|
|
116
|
-
expected_count = len(staged_elements)
|
|
117
|
-
|
|
118
|
-
run_uploader_and_validate(
|
|
119
|
-
uploader=dynamic_uploader,
|
|
120
|
-
path=staged_filepath,
|
|
121
|
-
file_data=file_data,
|
|
122
|
-
expected_count=expected_count,
|
|
123
|
-
)
|
|
124
|
-
run_uploader_and_validate(
|
|
125
|
-
uploader=fixed_size_uploader,
|
|
126
|
-
path=staged_filepath,
|
|
127
|
-
file_data=file_data,
|
|
128
|
-
expected_count=expected_count,
|
|
129
|
-
)
|
|
130
|
-
run_uploader_and_validate(
|
|
131
|
-
uploader=rate_limited_uploader,
|
|
132
|
-
path=staged_filepath,
|
|
133
|
-
file_data=file_data,
|
|
134
|
-
expected_count=expected_count,
|
|
135
|
-
)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
|
|
139
|
-
def test_weaviate_local_create_destination(weaviate_instance):
|
|
140
|
-
uploader = LocalWeaviateUploader(
|
|
141
|
-
upload_config=LocalWeaviateUploaderConfig(),
|
|
142
|
-
connection_config=LocalWeaviateConnectionConfig(),
|
|
143
|
-
)
|
|
144
|
-
collection_name = "system_created-123"
|
|
145
|
-
formatted_collection_name = "System_created_123"
|
|
146
|
-
created = uploader.create_destination(destination_name=collection_name)
|
|
147
|
-
assert created
|
|
148
|
-
with uploader.connection_config.get_client() as weaviate_client:
|
|
149
|
-
assert weaviate_client.collections.exists(name=formatted_collection_name)
|
|
150
|
-
|
|
151
|
-
created = uploader.create_destination(destination_name=collection_name)
|
|
152
|
-
assert not created
|
|
File without changes
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
@pytest.fixture
|
|
7
|
-
def embedder_file() -> Path:
|
|
8
|
-
int_test_dir = Path(__file__).parent
|
|
9
|
-
assets_dir = int_test_dir / "assets"
|
|
10
|
-
embedder_file = assets_dir / "DA-1p-with-duplicate-pages.pdf.json"
|
|
11
|
-
assert embedder_file.exists()
|
|
12
|
-
assert embedder_file.is_file()
|
|
13
|
-
return embedder_file
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
7
|
-
from test.integration.utils import requires_env
|
|
8
|
-
from unstructured_ingest.embed.azure_openai import (
|
|
9
|
-
AzureOpenAIEmbeddingConfig,
|
|
10
|
-
AzureOpenAIEmbeddingEncoder,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
13
|
-
|
|
14
|
-
API_KEY = "AZURE_OPENAI_API_KEY"
|
|
15
|
-
ENDPOINT = "AZURE_OPENAI_ENDPOINT"
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@dataclass(frozen=True)
|
|
19
|
-
class AzureData:
|
|
20
|
-
api_key: str
|
|
21
|
-
endpoint: str
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def get_azure_data() -> AzureData:
|
|
25
|
-
api_key = os.getenv(API_KEY, None)
|
|
26
|
-
assert api_key
|
|
27
|
-
endpoint = os.getenv(ENDPOINT, None)
|
|
28
|
-
assert endpoint
|
|
29
|
-
return AzureData(api_key, endpoint)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@requires_env(API_KEY, ENDPOINT)
|
|
33
|
-
def test_azure_openai_embedder(embedder_file: Path):
|
|
34
|
-
azure_data = get_azure_data()
|
|
35
|
-
embedder_config = EmbedderConfig(
|
|
36
|
-
embedding_provider="azure-openai",
|
|
37
|
-
embedding_api_key=azure_data.api_key,
|
|
38
|
-
embedding_azure_endpoint=azure_data.endpoint,
|
|
39
|
-
)
|
|
40
|
-
embedder = Embedder(config=embedder_config)
|
|
41
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
42
|
-
assert results
|
|
43
|
-
with embedder_file.open("r") as f:
|
|
44
|
-
original_elements = json.load(f)
|
|
45
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
@requires_env(API_KEY, ENDPOINT)
|
|
49
|
-
def test_raw_azure_openai_embedder(embedder_file: Path):
|
|
50
|
-
azure_data = get_azure_data()
|
|
51
|
-
embedder = AzureOpenAIEmbeddingEncoder(
|
|
52
|
-
config=AzureOpenAIEmbeddingConfig(
|
|
53
|
-
api_key=azure_data.api_key,
|
|
54
|
-
azure_endpoint=azure_data.endpoint,
|
|
55
|
-
)
|
|
56
|
-
)
|
|
57
|
-
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
|
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.integration.embedders.utils import (
|
|
8
|
-
validate_embedding_output,
|
|
9
|
-
validate_raw_embedder,
|
|
10
|
-
validate_raw_embedder_async,
|
|
11
|
-
)
|
|
12
|
-
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.embed.bedrock import (
|
|
14
|
-
AsyncBedrockEmbeddingEncoder,
|
|
15
|
-
BedrockEmbeddingConfig,
|
|
16
|
-
BedrockEmbeddingEncoder,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.errors_v2 import UserAuthError, UserError
|
|
19
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def get_aws_credentials() -> dict:
|
|
23
|
-
access_key = os.getenv("AWS_ACCESS_KEY_ID", None)
|
|
24
|
-
assert access_key
|
|
25
|
-
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", None)
|
|
26
|
-
assert secret_key
|
|
27
|
-
return {"aws_access_key_id": access_key, "aws_secret_access_key": secret_key}
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
|
|
31
|
-
def test_bedrock_embedder(embedder_file: Path):
|
|
32
|
-
aws_credentials = get_aws_credentials()
|
|
33
|
-
embedder_config = EmbedderConfig(
|
|
34
|
-
embedding_provider="bedrock",
|
|
35
|
-
embedding_aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
36
|
-
embedding_aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
37
|
-
)
|
|
38
|
-
embedder = Embedder(config=embedder_config)
|
|
39
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
40
|
-
assert results
|
|
41
|
-
with embedder_file.open("r") as f:
|
|
42
|
-
original_elements = json.load(f)
|
|
43
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
|
|
47
|
-
def test_raw_bedrock_embedder(embedder_file: Path):
|
|
48
|
-
aws_credentials = get_aws_credentials()
|
|
49
|
-
embedder = BedrockEmbeddingEncoder(
|
|
50
|
-
config=BedrockEmbeddingConfig(
|
|
51
|
-
aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
52
|
-
aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
53
|
-
)
|
|
54
|
-
)
|
|
55
|
-
validate_raw_embedder(
|
|
56
|
-
embedder=embedder,
|
|
57
|
-
embedder_file=embedder_file,
|
|
58
|
-
expected_dimension=1536,
|
|
59
|
-
expected_is_unit_vector=False,
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
|
|
64
|
-
embedder = BedrockEmbeddingEncoder(
|
|
65
|
-
config=BedrockEmbeddingConfig(
|
|
66
|
-
aws_access_key_id="no_key",
|
|
67
|
-
aws_secret_access_key="no_secret",
|
|
68
|
-
)
|
|
69
|
-
)
|
|
70
|
-
with pytest.raises(UserAuthError):
|
|
71
|
-
embedder.get_exemplary_embedding()
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
|
|
75
|
-
def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
|
|
76
|
-
aws_credentials = get_aws_credentials()
|
|
77
|
-
embedder = BedrockEmbeddingEncoder(
|
|
78
|
-
config=BedrockEmbeddingConfig(
|
|
79
|
-
aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
80
|
-
aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
81
|
-
model_name="invalid_model",
|
|
82
|
-
)
|
|
83
|
-
)
|
|
84
|
-
with pytest.raises(UserError):
|
|
85
|
-
embedder.get_exemplary_embedding()
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
|
|
89
|
-
@pytest.mark.asyncio
|
|
90
|
-
async def test_raw_async_bedrock_embedder(embedder_file: Path):
|
|
91
|
-
aws_credentials = get_aws_credentials()
|
|
92
|
-
embedder = AsyncBedrockEmbeddingEncoder(
|
|
93
|
-
config=BedrockEmbeddingConfig(
|
|
94
|
-
aws_access_key_id=aws_credentials["aws_access_key_id"],
|
|
95
|
-
aws_secret_access_key=aws_credentials["aws_secret_access_key"],
|
|
96
|
-
)
|
|
97
|
-
)
|
|
98
|
-
await validate_raw_embedder_async(
|
|
99
|
-
embedder=embedder,
|
|
100
|
-
embedder_file=embedder_file,
|
|
101
|
-
expected_dimension=1536,
|
|
102
|
-
expected_is_unit_vector=False,
|
|
103
|
-
)
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
5
|
-
from unstructured_ingest.embed.huggingface import (
|
|
6
|
-
HuggingFaceEmbeddingConfig,
|
|
7
|
-
HuggingFaceEmbeddingEncoder,
|
|
8
|
-
)
|
|
9
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def test_huggingface_embedder(embedder_file: Path):
|
|
13
|
-
embedder_config = EmbedderConfig(embedding_provider="huggingface")
|
|
14
|
-
embedder = Embedder(config=embedder_config)
|
|
15
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
16
|
-
assert results
|
|
17
|
-
with embedder_file.open("r") as f:
|
|
18
|
-
original_elements = json.load(f)
|
|
19
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def test_raw_hugginface_embedder(embedder_file: Path):
|
|
23
|
-
embedder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
|
|
24
|
-
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=384)
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.integration.embedders.utils import (
|
|
8
|
-
validate_embedding_output,
|
|
9
|
-
validate_raw_embedder,
|
|
10
|
-
validate_raw_embedder_async,
|
|
11
|
-
)
|
|
12
|
-
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.embed.mixedbreadai import (
|
|
14
|
-
AsyncMixedbreadAIEmbeddingEncoder,
|
|
15
|
-
MixedbreadAIEmbeddingConfig,
|
|
16
|
-
MixedbreadAIEmbeddingEncoder,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
19
|
-
|
|
20
|
-
API_KEY = "MXBAI_API_KEY"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def get_api_key() -> str:
|
|
24
|
-
api_key = os.getenv(API_KEY, None)
|
|
25
|
-
assert api_key
|
|
26
|
-
return api_key
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@requires_env(API_KEY)
|
|
30
|
-
def test_mixedbread_embedder(embedder_file: Path):
|
|
31
|
-
api_key = get_api_key()
|
|
32
|
-
embedder_config = EmbedderConfig(embedding_provider="mixedbread-ai", embedding_api_key=api_key)
|
|
33
|
-
embedder = Embedder(config=embedder_config)
|
|
34
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
35
|
-
assert results
|
|
36
|
-
with embedder_file.open("r") as f:
|
|
37
|
-
original_elements = json.load(f)
|
|
38
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@requires_env(API_KEY)
|
|
42
|
-
def test_raw_mixedbread_embedder(embedder_file: Path):
|
|
43
|
-
api_key = get_api_key()
|
|
44
|
-
embedder = MixedbreadAIEmbeddingEncoder(
|
|
45
|
-
config=MixedbreadAIEmbeddingConfig(
|
|
46
|
-
api_key=api_key,
|
|
47
|
-
)
|
|
48
|
-
)
|
|
49
|
-
validate_raw_embedder(
|
|
50
|
-
embedder=embedder,
|
|
51
|
-
embedder_file=embedder_file,
|
|
52
|
-
expected_dimension=1024,
|
|
53
|
-
expected_is_unit_vector=True,
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@requires_env(API_KEY)
|
|
58
|
-
@pytest.mark.asyncio
|
|
59
|
-
async def test_raw_async_mixedbread_embedder(embedder_file: Path):
|
|
60
|
-
api_key = get_api_key()
|
|
61
|
-
embedder = AsyncMixedbreadAIEmbeddingEncoder(
|
|
62
|
-
config=MixedbreadAIEmbeddingConfig(
|
|
63
|
-
api_key=api_key,
|
|
64
|
-
)
|
|
65
|
-
)
|
|
66
|
-
await validate_raw_embedder_async(
|
|
67
|
-
embedder=embedder,
|
|
68
|
-
embedder_file=embedder_file,
|
|
69
|
-
expected_dimension=1024,
|
|
70
|
-
expected_is_unit_vector=True,
|
|
71
|
-
)
|
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.integration.embedders.utils import (
|
|
8
|
-
validate_embedding_output,
|
|
9
|
-
validate_raw_embedder,
|
|
10
|
-
validate_raw_embedder_async,
|
|
11
|
-
)
|
|
12
|
-
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.embed.octoai import (
|
|
14
|
-
AsyncOctoAIEmbeddingEncoder,
|
|
15
|
-
OctoAiEmbeddingConfig,
|
|
16
|
-
OctoAIEmbeddingEncoder,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.errors_v2 import UserAuthError
|
|
19
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
20
|
-
|
|
21
|
-
API_KEY = "OCTOAI_API_KEY"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def get_api_key() -> str:
|
|
25
|
-
api_key = os.getenv(API_KEY, None)
|
|
26
|
-
assert api_key
|
|
27
|
-
return api_key
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@requires_env(API_KEY)
|
|
31
|
-
def test_octoai_embedder(embedder_file: Path):
|
|
32
|
-
api_key = get_api_key()
|
|
33
|
-
embedder_config = EmbedderConfig(embedding_provider="octoai", embedding_api_key=api_key)
|
|
34
|
-
embedder = Embedder(config=embedder_config)
|
|
35
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
36
|
-
assert results
|
|
37
|
-
with embedder_file.open("r") as f:
|
|
38
|
-
original_elements = json.load(f)
|
|
39
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@requires_env(API_KEY)
|
|
43
|
-
def test_raw_octoai_embedder(embedder_file: Path):
|
|
44
|
-
api_key = get_api_key()
|
|
45
|
-
embedder = OctoAIEmbeddingEncoder(
|
|
46
|
-
config=OctoAiEmbeddingConfig(
|
|
47
|
-
api_key=api_key,
|
|
48
|
-
)
|
|
49
|
-
)
|
|
50
|
-
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@pytest.mark.skip(reason="Unexpected connection error at the moment")
|
|
54
|
-
def test_raw_octoai_embedder_invalid_credentials():
|
|
55
|
-
embedder = OctoAIEmbeddingEncoder(
|
|
56
|
-
config=OctoAiEmbeddingConfig(
|
|
57
|
-
api_key="fake_api_key",
|
|
58
|
-
)
|
|
59
|
-
)
|
|
60
|
-
with pytest.raises(UserAuthError):
|
|
61
|
-
embedder.get_exemplary_embedding()
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
@requires_env(API_KEY)
|
|
65
|
-
@pytest.mark.asyncio
|
|
66
|
-
async def test_raw_async_octoai_embedder(embedder_file: Path):
|
|
67
|
-
api_key = get_api_key()
|
|
68
|
-
embedder = AsyncOctoAIEmbeddingEncoder(
|
|
69
|
-
config=OctoAiEmbeddingConfig(
|
|
70
|
-
api_key=api_key,
|
|
71
|
-
)
|
|
72
|
-
)
|
|
73
|
-
await validate_raw_embedder_async(
|
|
74
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
|
|
75
|
-
)
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.integration.embedders.utils import (
|
|
8
|
-
validate_embedding_output,
|
|
9
|
-
validate_raw_embedder,
|
|
10
|
-
validate_raw_embedder_async,
|
|
11
|
-
)
|
|
12
|
-
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.embed.openai import (
|
|
14
|
-
AsyncOpenAIEmbeddingEncoder,
|
|
15
|
-
OpenAIEmbeddingConfig,
|
|
16
|
-
OpenAIEmbeddingEncoder,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.errors_v2 import UserAuthError
|
|
19
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
20
|
-
|
|
21
|
-
API_KEY = "OPENAI_API_KEY"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def get_api_key() -> str:
|
|
25
|
-
api_key = os.getenv(API_KEY, None)
|
|
26
|
-
assert api_key
|
|
27
|
-
return api_key
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@requires_env(API_KEY)
|
|
31
|
-
def test_openai_embedder(embedder_file: Path):
|
|
32
|
-
api_key = get_api_key()
|
|
33
|
-
embedder_config = EmbedderConfig(embedding_provider="openai", embedding_api_key=api_key)
|
|
34
|
-
embedder = Embedder(config=embedder_config)
|
|
35
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
36
|
-
assert results
|
|
37
|
-
with embedder_file.open("r") as f:
|
|
38
|
-
original_elements = json.load(f)
|
|
39
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@requires_env(API_KEY)
|
|
43
|
-
def test_raw_openai_embedder(embedder_file: Path):
|
|
44
|
-
api_key = get_api_key()
|
|
45
|
-
embedder = OpenAIEmbeddingEncoder(
|
|
46
|
-
config=OpenAIEmbeddingConfig(
|
|
47
|
-
api_key=api_key,
|
|
48
|
-
)
|
|
49
|
-
)
|
|
50
|
-
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1536)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def test_raw_openai_embedder_invalid_credentials():
|
|
54
|
-
embedder = OpenAIEmbeddingEncoder(
|
|
55
|
-
config=OpenAIEmbeddingConfig(
|
|
56
|
-
api_key="fake_api_key",
|
|
57
|
-
)
|
|
58
|
-
)
|
|
59
|
-
with pytest.raises(UserAuthError):
|
|
60
|
-
embedder.get_exemplary_embedding()
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
@requires_env(API_KEY)
|
|
64
|
-
@pytest.mark.asyncio
|
|
65
|
-
async def test_raw_async_openai_embedder(embedder_file: Path):
|
|
66
|
-
api_key = get_api_key()
|
|
67
|
-
embedder = AsyncOpenAIEmbeddingEncoder(
|
|
68
|
-
config=OpenAIEmbeddingConfig(
|
|
69
|
-
api_key=api_key,
|
|
70
|
-
)
|
|
71
|
-
)
|
|
72
|
-
await validate_raw_embedder_async(
|
|
73
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimension=1536
|
|
74
|
-
)
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.integration.embedders.utils import (
|
|
8
|
-
validate_embedding_output,
|
|
9
|
-
validate_raw_embedder,
|
|
10
|
-
validate_raw_embedder_async,
|
|
11
|
-
)
|
|
12
|
-
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.embed.togetherai import (
|
|
14
|
-
AsyncTogetherAIEmbeddingEncoder,
|
|
15
|
-
TogetherAIEmbeddingConfig,
|
|
16
|
-
TogetherAIEmbeddingEncoder,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.errors_v2 import UserAuthError
|
|
19
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
20
|
-
|
|
21
|
-
API_KEY = "TOGETHERAI_API_KEY"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def get_api_key() -> str:
|
|
25
|
-
api_key = os.getenv(API_KEY, None)
|
|
26
|
-
assert api_key
|
|
27
|
-
return api_key
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@requires_env(API_KEY)
|
|
31
|
-
def test_togetherai_embedder(embedder_file: Path):
|
|
32
|
-
api_key = get_api_key()
|
|
33
|
-
embedder_config = EmbedderConfig(embedding_provider="togetherai", embedding_api_key=api_key)
|
|
34
|
-
embedder = Embedder(config=embedder_config)
|
|
35
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
36
|
-
assert results
|
|
37
|
-
with embedder_file.open("r") as f:
|
|
38
|
-
original_elements = json.load(f)
|
|
39
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@requires_env(API_KEY)
|
|
43
|
-
def test_raw_togetherai_embedder(embedder_file: Path):
|
|
44
|
-
api_key = get_api_key()
|
|
45
|
-
embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key=api_key))
|
|
46
|
-
validate_raw_embedder(
|
|
47
|
-
embedder=embedder,
|
|
48
|
-
embedder_file=embedder_file,
|
|
49
|
-
expected_dimension=768,
|
|
50
|
-
expected_is_unit_vector=False,
|
|
51
|
-
)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def test_raw_togetherai_embedder_invalid_credentials():
|
|
55
|
-
embedder = TogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key="fake_api_key"))
|
|
56
|
-
|
|
57
|
-
with pytest.raises(UserAuthError):
|
|
58
|
-
embedder.get_exemplary_embedding()
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
@requires_env(API_KEY)
|
|
62
|
-
@pytest.mark.asyncio
|
|
63
|
-
async def test_raw_async_togetherai_embedder(embedder_file: Path):
|
|
64
|
-
api_key = get_api_key()
|
|
65
|
-
embedder = AsyncTogetherAIEmbeddingEncoder(config=TogetherAIEmbeddingConfig(api_key=api_key))
|
|
66
|
-
await validate_raw_embedder_async(
|
|
67
|
-
embedder=embedder,
|
|
68
|
-
embedder_file=embedder_file,
|
|
69
|
-
expected_dimension=768,
|
|
70
|
-
expected_is_unit_vector=False,
|
|
71
|
-
)
|