unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +48 -34
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
from typing import Any
|
|
3
|
-
|
|
4
|
-
import faker
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.unit.utils.data_generator import generate_random_dictionary
|
|
8
|
-
from unstructured_ingest.embed.huggingface import (
|
|
9
|
-
HuggingFaceEmbeddingConfig,
|
|
10
|
-
HuggingFaceEmbeddingEncoder,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
fake = faker.Faker()
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def generate_embedder_config_params() -> dict:
|
|
17
|
-
params = {}
|
|
18
|
-
if random.random() < 0.5:
|
|
19
|
-
params["embedder_model_name"] = fake.word() if random.random() < 0.5 else None
|
|
20
|
-
params["embedder_model_kwargs"] = (
|
|
21
|
-
generate_random_dictionary(key_type=str, value_type=Any)
|
|
22
|
-
if random.random() < 0.5
|
|
23
|
-
else None
|
|
24
|
-
)
|
|
25
|
-
params["encode_kwargs"] = (
|
|
26
|
-
generate_random_dictionary(key_type=str, value_type=Any)
|
|
27
|
-
if random.random() < 0.5
|
|
28
|
-
else None
|
|
29
|
-
)
|
|
30
|
-
params["cache_folder"] = fake.file_path() if random.random() < 0.5 else None
|
|
31
|
-
return params
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
@pytest.mark.parametrize(
|
|
35
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
36
|
-
)
|
|
37
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
38
|
-
embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
|
|
39
|
-
assert embedder_config
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@pytest.mark.parametrize(
|
|
43
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
44
|
-
)
|
|
45
|
-
def test_embedder(embedder_config_params: dict):
|
|
46
|
-
embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
|
|
47
|
-
embedder = HuggingFaceEmbeddingEncoder(config=embedder_config)
|
|
48
|
-
assert embedder
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
import faker
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.embed.mixedbreadai import (
|
|
7
|
-
MixedbreadAIEmbeddingConfig,
|
|
8
|
-
MixedbreadAIEmbeddingEncoder,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
fake = faker.Faker()
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def generate_embedder_config_params() -> dict:
|
|
15
|
-
params = {
|
|
16
|
-
"api_key": fake.password(),
|
|
17
|
-
}
|
|
18
|
-
if random.random() < 0.5:
|
|
19
|
-
params["embedder_model_name"] = fake.word()
|
|
20
|
-
return params
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pytest.mark.parametrize(
|
|
24
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
25
|
-
)
|
|
26
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
27
|
-
embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
28
|
-
assert embedder_config
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@pytest.mark.parametrize(
|
|
32
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
33
|
-
)
|
|
34
|
-
def test_embedder(embedder_config_params: dict):
|
|
35
|
-
embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
36
|
-
embedder = MixedbreadAIEmbeddingEncoder(config=embedder_config)
|
|
37
|
-
assert embedder
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
import faker
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
7
|
-
|
|
8
|
-
fake = faker.Faker()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def generate_embedder_config_params() -> dict:
|
|
12
|
-
params = {
|
|
13
|
-
"api_key": fake.password(),
|
|
14
|
-
}
|
|
15
|
-
if random.random() < 0.5:
|
|
16
|
-
params["embedder_model_name"] = fake.word()
|
|
17
|
-
params["base_url"] = fake.url()
|
|
18
|
-
return params
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@pytest.mark.parametrize(
|
|
22
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
23
|
-
)
|
|
24
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
25
|
-
embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
|
|
26
|
-
assert embedder_config
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@pytest.mark.parametrize(
|
|
30
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
31
|
-
)
|
|
32
|
-
def test_embedder(embedder_config_params: dict):
|
|
33
|
-
embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
|
|
34
|
-
embedder = OctoAIEmbeddingEncoder(config=embedder_config)
|
|
35
|
-
assert embedder
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
import faker
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
7
|
-
|
|
8
|
-
fake = faker.Faker()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def generate_embedder_config_params() -> dict:
|
|
12
|
-
params = {
|
|
13
|
-
"api_key": fake.password(),
|
|
14
|
-
}
|
|
15
|
-
if random.random() < 0.5:
|
|
16
|
-
params["embedder_model_name"] = fake.word()
|
|
17
|
-
params["base_url"] = fake.url()
|
|
18
|
-
return params
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@pytest.mark.parametrize(
|
|
22
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
23
|
-
)
|
|
24
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
25
|
-
embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
26
|
-
assert embedder_config
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@pytest.mark.parametrize(
|
|
30
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
31
|
-
)
|
|
32
|
-
def test_embedder(embedder_config_params: dict):
|
|
33
|
-
embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
34
|
-
embedder = OpenAIEmbeddingEncoder(config=embedder_config)
|
|
35
|
-
assert embedder
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
import faker
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.embed.togetherai import (
|
|
7
|
-
TogetherAIEmbeddingConfig,
|
|
8
|
-
TogetherAIEmbeddingEncoder,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
fake = faker.Faker()
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def generate_embedder_config_params() -> dict:
|
|
15
|
-
params = {
|
|
16
|
-
"api_key": fake.password(),
|
|
17
|
-
}
|
|
18
|
-
if random.random() < 0.5:
|
|
19
|
-
params["embedder_model_name"] = fake.word()
|
|
20
|
-
return params
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pytest.mark.parametrize(
|
|
24
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
25
|
-
)
|
|
26
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
27
|
-
embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
28
|
-
assert embedder_config
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@pytest.mark.parametrize(
|
|
32
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
33
|
-
)
|
|
34
|
-
def test_embedder(embedder_config_params: dict):
|
|
35
|
-
embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
36
|
-
embedder = TogetherAIEmbeddingEncoder(config=embedder_config)
|
|
37
|
-
assert embedder
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import random
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
import faker
|
|
6
|
-
import pytest
|
|
7
|
-
|
|
8
|
-
from test.unit.utils.data_generator import generate_random_dictionary
|
|
9
|
-
from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
|
10
|
-
|
|
11
|
-
fake = faker.Faker()
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def generate_embedder_config_params() -> dict:
|
|
15
|
-
params = {
|
|
16
|
-
"api_key": json.dumps(generate_random_dictionary(key_type=str, value_type=Any)),
|
|
17
|
-
}
|
|
18
|
-
if random.random() < 0.5:
|
|
19
|
-
params["embedder_model_name"] = fake.word()
|
|
20
|
-
return params
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pytest.mark.parametrize(
|
|
24
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
25
|
-
)
|
|
26
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
27
|
-
embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
28
|
-
assert embedder_config
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@pytest.mark.parametrize(
|
|
32
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
33
|
-
)
|
|
34
|
-
def test_embedder(embedder_config_params: dict):
|
|
35
|
-
embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
36
|
-
embedder = VertexAIEmbeddingEncoder(config=embedder_config)
|
|
37
|
-
assert embedder
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
import faker
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
|
|
7
|
-
|
|
8
|
-
fake = faker.Faker()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def generate_embedder_config_params() -> dict:
|
|
12
|
-
params = {
|
|
13
|
-
"api_key": fake.password(),
|
|
14
|
-
}
|
|
15
|
-
if random.random() < 0.5:
|
|
16
|
-
params["embedder_model_name"] = fake.word()
|
|
17
|
-
params["batch_size"] = fake.random_int(max=100)
|
|
18
|
-
params["truncation"] = fake.boolean()
|
|
19
|
-
params["max_retries"] = fake.random_int()
|
|
20
|
-
params["timeout_in_seconds"] = fake.random_int()
|
|
21
|
-
return params
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@pytest.mark.parametrize(
|
|
25
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
26
|
-
)
|
|
27
|
-
def test_embedder_config(embedder_config_params: dict):
|
|
28
|
-
embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
29
|
-
assert embedder_config
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@pytest.mark.parametrize(
|
|
33
|
-
"embedder_config_params", [generate_embedder_config_params() for i in range(10)]
|
|
34
|
-
)
|
|
35
|
-
def test_embedder(embedder_config_params: dict):
|
|
36
|
-
embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
|
|
37
|
-
embedder = VoyageAIEmbeddingEncoder(config=embedder_config)
|
|
38
|
-
assert embedder
|
|
File without changes
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
from typing import Any
|
|
3
|
-
|
|
4
|
-
import faker
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.unit.utils.data_generator import generate_random_dictionary
|
|
8
|
-
from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
|
|
9
|
-
|
|
10
|
-
fake = faker.Faker()
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def generate_partitioner_config_params() -> dict:
|
|
14
|
-
params = {
|
|
15
|
-
"strategy": random.choice(["fast", "hi_res", "auto"]),
|
|
16
|
-
"ocr_languages": fake.words() if random.random() < 0.5 else None,
|
|
17
|
-
"encoding": fake.word() if random.random() < 0.5 else None,
|
|
18
|
-
"additional_partition_args": (
|
|
19
|
-
generate_random_dictionary(key_type=str, value_type=Any)
|
|
20
|
-
if random.random() < 0.5
|
|
21
|
-
else None
|
|
22
|
-
),
|
|
23
|
-
"skip_infer_table_types": fake.words() if random.random() < 0.5 else None,
|
|
24
|
-
"flatten_metadata": fake.boolean(),
|
|
25
|
-
"hi_res_model_name": fake.word() if random.random() < 0.5 else None,
|
|
26
|
-
}
|
|
27
|
-
random_val = random.random()
|
|
28
|
-
# Randomly set the fields_include to a random list[str]
|
|
29
|
-
if random_val < 0.5:
|
|
30
|
-
params["fields_include"] = fake.words()
|
|
31
|
-
|
|
32
|
-
# Randomly set the metadata_exclude or metadata_include to a valid
|
|
33
|
-
# list[str] or don't set it at all
|
|
34
|
-
if random.random() < (1 / 3):
|
|
35
|
-
params["metadata_exclude"] = fake.words()
|
|
36
|
-
elif random_val < (2 / 3):
|
|
37
|
-
params["metadata_include"] = fake.words()
|
|
38
|
-
|
|
39
|
-
# Randomly set the values associated with calling the api, or not at all
|
|
40
|
-
if random.random() < 0.5:
|
|
41
|
-
params["partition_by_api"]: True
|
|
42
|
-
params["partition_endpoint"] = fake.url()
|
|
43
|
-
params["api_key"] = fake.password()
|
|
44
|
-
else:
|
|
45
|
-
params["partition_by_api"]: False
|
|
46
|
-
return params
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
@pytest.mark.parametrize(
|
|
50
|
-
"partition_config_params", [generate_partitioner_config_params() for i in range(10)]
|
|
51
|
-
)
|
|
52
|
-
def test_partition_config(partition_config_params: dict):
|
|
53
|
-
partition_config = PartitionerConfig.model_validate(partition_config_params)
|
|
54
|
-
assert partition_config
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@pytest.mark.parametrize(
|
|
58
|
-
"partition_config_params", [generate_partitioner_config_params() for i in range(10)]
|
|
59
|
-
)
|
|
60
|
-
def test_partitioner(partition_config_params: dict):
|
|
61
|
-
partition_config = PartitionerConfig.model_validate(partition_config_params)
|
|
62
|
-
partitioner = Partitioner(config=partition_config)
|
|
63
|
-
assert partitioner
|
test/unit/test_error.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
|
-
from unstructured_ingest.error import (
|
|
4
|
-
DestinationConnectionError,
|
|
5
|
-
PartitionError,
|
|
6
|
-
SourceConnectionError,
|
|
7
|
-
)
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@pytest.mark.parametrize(
|
|
11
|
-
("error_class", "exception_type", "error_message"),
|
|
12
|
-
[
|
|
13
|
-
(SourceConnectionError, ValueError, "Simulated connection error"),
|
|
14
|
-
(DestinationConnectionError, RuntimeError, "Simulated connection error"),
|
|
15
|
-
(PartitionError, FileNotFoundError, "Simulated partition error"),
|
|
16
|
-
],
|
|
17
|
-
)
|
|
18
|
-
def test_custom_error_decorator(error_class, exception_type, error_message):
|
|
19
|
-
@error_class.wrap
|
|
20
|
-
def simulate_error():
|
|
21
|
-
raise exception_type(error_message)
|
|
22
|
-
|
|
23
|
-
with pytest.raises(error_class) as context:
|
|
24
|
-
simulate_error()
|
|
25
|
-
|
|
26
|
-
expected_error_string = error_class.error_string.format(error_message)
|
|
27
|
-
assert str(context.value) == expected_error_string
|
test/unit/test_html.py
DELETED
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
import base64
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from bs4 import BeautifulSoup
|
|
5
|
-
from pytest_mock import MockerFixture
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
8
|
-
from unstructured_ingest.utils.html import HtmlMixin
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def test_extract_images(mocker: MockerFixture):
|
|
12
|
-
mixin = HtmlMixin(extract_images=True)
|
|
13
|
-
mock_download_response = b"DOWNLOADED"
|
|
14
|
-
expected_image_src = base64.b64encode(mock_download_response).decode()
|
|
15
|
-
mocked_download_response = mocker.patch(
|
|
16
|
-
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
17
|
-
return_value=mock_download_response,
|
|
18
|
-
)
|
|
19
|
-
url = "http://mywebsite.com/path/to/page"
|
|
20
|
-
html = """
|
|
21
|
-
<img src="http://mywebsite.com/img1.jpg"/>
|
|
22
|
-
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
23
|
-
<img src="img3.jpg"/>
|
|
24
|
-
<img src="data:image/png;base64,24689654..."/>
|
|
25
|
-
"""
|
|
26
|
-
expected_html = f"""
|
|
27
|
-
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
28
|
-
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
29
|
-
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
30
|
-
<img src="data:image/png;base64,24689654..."/>
|
|
31
|
-
"""
|
|
32
|
-
expected_soup = BeautifulSoup(expected_html, "html.parser")
|
|
33
|
-
result = mixin.extract_html_images(url=url, html=html)
|
|
34
|
-
result_soup = BeautifulSoup(result, "html.parser")
|
|
35
|
-
assert expected_soup == result_soup
|
|
36
|
-
assert mocked_download_response.call_count == 2
|
|
37
|
-
urls_to_download = [
|
|
38
|
-
call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
|
|
39
|
-
]
|
|
40
|
-
assert urls_to_download == ["http://mywebsite.com/img1.jpg", "http://mywebsite.com/img3.jpg"]
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def test_extract_images_allow_list(mocker: MockerFixture):
|
|
44
|
-
mixin = HtmlMixin(
|
|
45
|
-
extract_images=True, allow_list=["http://allowedwebsite1.com", "http://allowedwebsite2.com"]
|
|
46
|
-
)
|
|
47
|
-
mock_download_response = b"DOWNLOADED"
|
|
48
|
-
expected_image_src = base64.b64encode(mock_download_response).decode()
|
|
49
|
-
mocked_download_response = mocker.patch(
|
|
50
|
-
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
51
|
-
return_value=mock_download_response,
|
|
52
|
-
)
|
|
53
|
-
url = "http://mywebsite.com/path/to/page"
|
|
54
|
-
html = """
|
|
55
|
-
<img src="http://mywebsite.com/img1.jpg"/>
|
|
56
|
-
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
57
|
-
<img src="http://allowedwebsite1.com/img2.jpg"/>
|
|
58
|
-
<img src="http://allowedwebsite2.com/img2.jpg"/>
|
|
59
|
-
"""
|
|
60
|
-
|
|
61
|
-
expected_html = f"""
|
|
62
|
-
<img src="http://mywebsite.com/img1.jpg"/>
|
|
63
|
-
<img src="http://notmywebsite.com/img2.jpg"/>
|
|
64
|
-
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
65
|
-
<img src="data:image/png;base64,{expected_image_src}"/>
|
|
66
|
-
"""
|
|
67
|
-
expected_soup = BeautifulSoup(expected_html, "html.parser")
|
|
68
|
-
result = mixin.extract_html_images(url=url, html=html)
|
|
69
|
-
result_soup = BeautifulSoup(result, "html.parser")
|
|
70
|
-
assert expected_soup == result_soup
|
|
71
|
-
assert mocked_download_response.call_count == 2
|
|
72
|
-
urls_to_download = [
|
|
73
|
-
call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
|
|
74
|
-
]
|
|
75
|
-
assert urls_to_download == [
|
|
76
|
-
"http://allowedwebsite1.com/img2.jpg",
|
|
77
|
-
"http://allowedwebsite2.com/img2.jpg",
|
|
78
|
-
]
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def test_extract_embedded_docs(mocker: MockerFixture):
|
|
82
|
-
mixin = HtmlMixin(extract_files=True)
|
|
83
|
-
mock_download_response = b"DOWNLOADED"
|
|
84
|
-
mocked_download_response = mocker.patch(
|
|
85
|
-
"unstructured_ingest.utils.html.HtmlMixin.download_content",
|
|
86
|
-
return_value=mock_download_response,
|
|
87
|
-
)
|
|
88
|
-
mocked_write_content = mocker.patch("unstructured_ingest.utils.html.HtmlMixin.write_content")
|
|
89
|
-
url = "http://mywebsite.com/path/to/page"
|
|
90
|
-
html = """
|
|
91
|
-
<a href="http://mywebsite.com/file.pdf"/>
|
|
92
|
-
<a href="http://notmywebsite.com/file.pdf"/>
|
|
93
|
-
<a href="http://mywebsite.com/another/link"/>
|
|
94
|
-
<a href="another/link/2"/>
|
|
95
|
-
<a href="file.doc"/>
|
|
96
|
-
"""
|
|
97
|
-
file_data = FileData(
|
|
98
|
-
source_identifiers=SourceIdentifiers(
|
|
99
|
-
fullpath="file.txt",
|
|
100
|
-
filename="file.txt",
|
|
101
|
-
),
|
|
102
|
-
connector_type="my_connector",
|
|
103
|
-
identifier="mock_file_data",
|
|
104
|
-
)
|
|
105
|
-
results = mixin.extract_embedded_files(
|
|
106
|
-
url=url, html=html, download_dir=Path("/tmp/download/location"), original_filedata=file_data
|
|
107
|
-
)
|
|
108
|
-
assert len(results) == 2
|
|
109
|
-
downloaded_urls = [r["file_data"].metadata.url for r in results]
|
|
110
|
-
assert downloaded_urls == ["http://mywebsite.com/file.pdf", "http://mywebsite.com/file.doc"]
|
|
111
|
-
assert mocked_download_response.call_count == 2
|
|
112
|
-
assert mocked_write_content.call_count == 2
|
test/unit/test_interfaces.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
from pydantic import Secret, ValidationError
|
|
3
|
-
|
|
4
|
-
from unstructured_ingest.interfaces import AccessConfig, ConnectionConfig
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def test_failing_connection_config():
|
|
8
|
-
class MyAccessConfig(AccessConfig):
|
|
9
|
-
sensitive_value: str
|
|
10
|
-
|
|
11
|
-
class MyConnectionConfig(ConnectionConfig):
|
|
12
|
-
access_config: MyAccessConfig
|
|
13
|
-
|
|
14
|
-
with pytest.raises(ValidationError):
|
|
15
|
-
MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def test_happy_path_connection_config():
|
|
19
|
-
class MyAccessConfig(AccessConfig):
|
|
20
|
-
sensitive_value: str
|
|
21
|
-
|
|
22
|
-
class MyConnectionConfig(ConnectionConfig):
|
|
23
|
-
access_config: Secret[MyAccessConfig]
|
|
24
|
-
|
|
25
|
-
connection_config = MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
|
|
26
|
-
assert connection_config
|