unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.integration.embedders.utils import (
|
|
8
|
-
validate_embedding_output,
|
|
9
|
-
validate_raw_embedder,
|
|
10
|
-
validate_raw_embedder_async,
|
|
11
|
-
)
|
|
12
|
-
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.embed.vertexai import (
|
|
14
|
-
AsyncVertexAIEmbeddingEncoder,
|
|
15
|
-
VertexAIEmbeddingConfig,
|
|
16
|
-
VertexAIEmbeddingEncoder,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
19
|
-
|
|
20
|
-
API_KEY = "VERTEXAI_API_KEY"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def get_api_key() -> str:
|
|
24
|
-
api_key = os.getenv(API_KEY, None)
|
|
25
|
-
assert api_key
|
|
26
|
-
return api_key
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@requires_env(API_KEY)
|
|
30
|
-
def test_vertexai_embedder(embedder_file: Path):
|
|
31
|
-
api_key = get_api_key()
|
|
32
|
-
embedder_config = EmbedderConfig(embedding_provider="vertexai", embedding_api_key=api_key)
|
|
33
|
-
embedder = Embedder(config=embedder_config)
|
|
34
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
35
|
-
assert results
|
|
36
|
-
with embedder_file.open("r") as f:
|
|
37
|
-
original_elements = json.load(f)
|
|
38
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@requires_env(API_KEY)
|
|
42
|
-
def test_raw_vertexai_embedder(embedder_file: Path):
|
|
43
|
-
api_key = get_api_key()
|
|
44
|
-
embedder = VertexAIEmbeddingEncoder(
|
|
45
|
-
config=VertexAIEmbeddingConfig(
|
|
46
|
-
api_key=api_key,
|
|
47
|
-
)
|
|
48
|
-
)
|
|
49
|
-
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=768)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@requires_env(API_KEY)
|
|
53
|
-
@pytest.mark.asyncio
|
|
54
|
-
async def test_raw_async_vertexai_embedder(embedder_file: Path):
|
|
55
|
-
api_key = get_api_key()
|
|
56
|
-
embedder = AsyncVertexAIEmbeddingEncoder(
|
|
57
|
-
config=VertexAIEmbeddingConfig(
|
|
58
|
-
api_key=api_key,
|
|
59
|
-
)
|
|
60
|
-
)
|
|
61
|
-
await validate_raw_embedder_async(
|
|
62
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimension=768
|
|
63
|
-
)
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from test.integration.embedders.utils import (
|
|
8
|
-
validate_embedding_output,
|
|
9
|
-
validate_raw_embedder,
|
|
10
|
-
validate_raw_embedder_async,
|
|
11
|
-
)
|
|
12
|
-
from test.integration.utils import requires_env
|
|
13
|
-
from unstructured_ingest.embed.voyageai import (
|
|
14
|
-
AsyncVoyageAIEmbeddingEncoder,
|
|
15
|
-
VoyageAIEmbeddingConfig,
|
|
16
|
-
VoyageAIEmbeddingEncoder,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
|
|
19
|
-
|
|
20
|
-
API_KEY = "VOYAGEAI_API_KEY"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def get_api_key() -> str:
|
|
24
|
-
api_key = os.getenv(API_KEY, None)
|
|
25
|
-
assert api_key
|
|
26
|
-
return api_key
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@requires_env(API_KEY)
|
|
30
|
-
def test_voyageai_embedder(embedder_file: Path):
|
|
31
|
-
api_key = get_api_key()
|
|
32
|
-
embedder_config = EmbedderConfig(embedding_provider="voyageai", embedding_api_key=api_key)
|
|
33
|
-
embedder = Embedder(config=embedder_config)
|
|
34
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
35
|
-
assert results
|
|
36
|
-
with embedder_file.open("r") as f:
|
|
37
|
-
original_elements = json.load(f)
|
|
38
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@requires_env(API_KEY)
|
|
42
|
-
def test_raw_voyageai_embedder(embedder_file: Path):
|
|
43
|
-
api_key = get_api_key()
|
|
44
|
-
embedder = VoyageAIEmbeddingEncoder(
|
|
45
|
-
config=VoyageAIEmbeddingConfig(
|
|
46
|
-
api_key=api_key,
|
|
47
|
-
)
|
|
48
|
-
)
|
|
49
|
-
validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@requires_env(API_KEY)
|
|
53
|
-
@pytest.mark.asyncio
|
|
54
|
-
async def test_raw_async_voyageai_embedder(embedder_file: Path):
|
|
55
|
-
api_key = get_api_key()
|
|
56
|
-
embedder = AsyncVoyageAIEmbeddingEncoder(
|
|
57
|
-
config=VoyageAIEmbeddingConfig(
|
|
58
|
-
api_key=api_key,
|
|
59
|
-
)
|
|
60
|
-
)
|
|
61
|
-
await validate_raw_embedder_async(
|
|
62
|
-
embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
@requires_env(API_KEY)
|
|
67
|
-
def test_voyageai_multimodal_embedder(embedder_file: Path):
|
|
68
|
-
api_key = get_api_key()
|
|
69
|
-
embedder_config = EmbedderConfig(
|
|
70
|
-
embedding_provider="voyageai",
|
|
71
|
-
embedding_api_key=api_key,
|
|
72
|
-
embedding_model_name="voyage-multimodal-3",
|
|
73
|
-
)
|
|
74
|
-
embedder = Embedder(config=embedder_config)
|
|
75
|
-
results = embedder.run(elements_filepath=embedder_file)
|
|
76
|
-
assert results
|
|
77
|
-
with embedder_file.open("r") as f:
|
|
78
|
-
original_elements = json.load(f)
|
|
79
|
-
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
from typing import Optional
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.embed.interfaces import AsyncBaseEmbeddingEncoder, BaseEmbeddingEncoder
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def validate_embedding_output(original_elements: list[dict], output_elements: list[dict]):
|
|
9
|
-
"""
|
|
10
|
-
Make sure the following characteristics are met:
|
|
11
|
-
* The same number of elements are returned
|
|
12
|
-
* For each element that had text, an embeddings entry was added in the output
|
|
13
|
-
* Other than the embedding, nothing about the element was changed
|
|
14
|
-
"""
|
|
15
|
-
assert len(original_elements) == len(output_elements)
|
|
16
|
-
for original_element, output_element in zip(original_elements, output_elements):
|
|
17
|
-
if original_element.get("text"):
|
|
18
|
-
assert output_element.get("embeddings", None)
|
|
19
|
-
output_element.pop("embeddings", None)
|
|
20
|
-
assert original_element == output_element
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def validate_raw_embedder(
|
|
24
|
-
embedder: BaseEmbeddingEncoder,
|
|
25
|
-
embedder_file: Path,
|
|
26
|
-
expected_dimension: Optional[int] = None,
|
|
27
|
-
expected_is_unit_vector: bool = True,
|
|
28
|
-
):
|
|
29
|
-
with open(embedder_file) as f:
|
|
30
|
-
elements = json.load(f)
|
|
31
|
-
all_text = [element["text"] for element in elements]
|
|
32
|
-
single_text = all_text[0]
|
|
33
|
-
dimension = embedder.dimension
|
|
34
|
-
if expected_dimension:
|
|
35
|
-
assert (
|
|
36
|
-
dimension == expected_dimension
|
|
37
|
-
), f"dimensions {dimension} didn't match expected: {expected_dimension}"
|
|
38
|
-
is_unit_vector = embedder.is_unit_vector
|
|
39
|
-
assert is_unit_vector == expected_is_unit_vector
|
|
40
|
-
single_embedding = embedder.embed_query(query=single_text)
|
|
41
|
-
assert len(single_embedding) == dimension
|
|
42
|
-
embedded_elements = embedder.embed_documents(elements=elements)
|
|
43
|
-
validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
async def validate_raw_embedder_async(
|
|
47
|
-
embedder: AsyncBaseEmbeddingEncoder,
|
|
48
|
-
embedder_file: Path,
|
|
49
|
-
expected_dimension: Optional[int] = None,
|
|
50
|
-
expected_is_unit_vector: bool = True,
|
|
51
|
-
):
|
|
52
|
-
with open(embedder_file) as f:
|
|
53
|
-
elements = json.load(f)
|
|
54
|
-
all_text = [element["text"] for element in elements]
|
|
55
|
-
single_text = all_text[0]
|
|
56
|
-
dimension = await embedder.dimension
|
|
57
|
-
if expected_dimension:
|
|
58
|
-
assert (
|
|
59
|
-
dimension == expected_dimension
|
|
60
|
-
), f"dimension {dimension} didn't match expected: {expected_dimension}"
|
|
61
|
-
is_unit_vector = await embedder.is_unit_vector
|
|
62
|
-
assert is_unit_vector == expected_is_unit_vector
|
|
63
|
-
single_embedding = await embedder.embed_query(query=single_text)
|
|
64
|
-
assert len(single_embedding) == dimension
|
|
65
|
-
embedded_elements = await embedder.embed_documents(elements=elements)
|
|
66
|
-
validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
|
|
File without changes
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from test.integration.utils import requires_env
|
|
7
|
-
from unstructured_ingest.errors_v2 import UserError
|
|
8
|
-
from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
|
|
9
|
-
|
|
10
|
-
int_test_dir = Path(__file__).parent
|
|
11
|
-
assets_dir = int_test_dir / "assets"
|
|
12
|
-
|
|
13
|
-
all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
|
|
14
|
-
non_image_partition_files = [
|
|
15
|
-
path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
|
|
16
|
-
]
|
|
17
|
-
supported_fast_partition_files = [
|
|
18
|
-
path for path in non_image_partition_files if path.suffix != ".eml"
|
|
19
|
-
]
|
|
20
|
-
image_partition_files = [
|
|
21
|
-
path for path in all_partition_files if path not in non_image_partition_files
|
|
22
|
-
]
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@pytest.mark.parametrize(
|
|
26
|
-
"partition_file", all_partition_files, ids=[path.name for path in all_partition_files]
|
|
27
|
-
)
|
|
28
|
-
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
|
|
29
|
-
@pytest.mark.asyncio
|
|
30
|
-
async def test_partitioner_api_hi_res(partition_file: Path):
|
|
31
|
-
api_key = os.getenv("UNSTRUCTURED_API_KEY")
|
|
32
|
-
api_url = os.getenv("UNSTRUCTURED_API_URL")
|
|
33
|
-
partitioner_config = PartitionerConfig(
|
|
34
|
-
strategy="hi_res", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
|
|
35
|
-
)
|
|
36
|
-
partitioner = Partitioner(config=partitioner_config)
|
|
37
|
-
results = await partitioner.run_async(filename=partition_file)
|
|
38
|
-
assert results
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@pytest.mark.parametrize(
|
|
42
|
-
"partition_file",
|
|
43
|
-
supported_fast_partition_files,
|
|
44
|
-
ids=[path.name for path in supported_fast_partition_files],
|
|
45
|
-
)
|
|
46
|
-
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
|
|
47
|
-
@pytest.mark.asyncio
|
|
48
|
-
async def test_partitioner_api_fast(partition_file: Path):
|
|
49
|
-
api_key = os.getenv("UNSTRUCTURED_API_KEY")
|
|
50
|
-
api_url = os.getenv("UNSTRUCTURED_API_URL")
|
|
51
|
-
partitioner_config = PartitionerConfig(
|
|
52
|
-
strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
|
|
53
|
-
)
|
|
54
|
-
partitioner = Partitioner(config=partitioner_config)
|
|
55
|
-
results = await partitioner.run_async(filename=partition_file)
|
|
56
|
-
assert results
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
@pytest.mark.parametrize(
|
|
60
|
-
"partition_file", image_partition_files, ids=[path.name for path in image_partition_files]
|
|
61
|
-
)
|
|
62
|
-
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
|
|
63
|
-
@pytest.mark.asyncio
|
|
64
|
-
async def test_partitioner_api_fast_error(partition_file: Path):
|
|
65
|
-
api_key = os.getenv("UNSTRUCTURED_API_KEY")
|
|
66
|
-
api_url = os.getenv("UNSTRUCTURED_API_URL")
|
|
67
|
-
partitioner_config = PartitionerConfig(
|
|
68
|
-
strategy="fast",
|
|
69
|
-
partition_by_api=True,
|
|
70
|
-
api_key=api_key,
|
|
71
|
-
partition_endpoint=api_url,
|
|
72
|
-
raise_unsupported_filetype=True,
|
|
73
|
-
)
|
|
74
|
-
partitioner = Partitioner(config=partitioner_config)
|
|
75
|
-
with pytest.raises(UserError):
|
|
76
|
-
await partitioner.run_async(filename=partition_file)
|
test/integration/utils.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def requires_env(*envs):
|
|
7
|
-
if len(envs) == 1:
|
|
8
|
-
env = envs[0]
|
|
9
|
-
return pytest.mark.skipif(
|
|
10
|
-
env not in os.environ, reason=f"Environment variable not set: {env}"
|
|
11
|
-
)
|
|
12
|
-
return pytest.mark.skipif(
|
|
13
|
-
not all(env in os.environ for env in envs),
|
|
14
|
-
reason="All required environment variables not set: {}".format(", ".join(envs)),
|
|
15
|
-
)
|
test/unit/__init__.py
DELETED
|
File without changes
|
test/unit/chunkers/__init__.py
DELETED
|
File without changes
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
import random
|
|
2
|
-
|
|
3
|
-
import faker
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
|
|
7
|
-
|
|
8
|
-
fake = faker.Faker()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def generate_chunker_config_params() -> dict:
|
|
12
|
-
params = {}
|
|
13
|
-
random_val = random.random()
|
|
14
|
-
if random_val < 0.5:
|
|
15
|
-
params["chunking_strategy"] = fake.word() if random.random() < 0.5 else None
|
|
16
|
-
params["chunk_combine_text_under_n_chars"] = (
|
|
17
|
-
fake.random_int() if random.random() < 0.5 else None
|
|
18
|
-
)
|
|
19
|
-
params["chunk_include_orig_elements"] = fake.boolean() if random.random() < 0.5 else None
|
|
20
|
-
params["chunk_max_characters"] = fake.random_int()
|
|
21
|
-
params["chunk_multipage_sections"] = fake.boolean()
|
|
22
|
-
params["chunk_new_after_n_chars"] = fake.random_int() if random.random() < 0.5 else None
|
|
23
|
-
params["chunk_overlap"] = fake.random_int() if random.random() < 0.5 else None
|
|
24
|
-
params["chunk_overlap_all"] = fake.boolean() if random.random() < 0.5 else None
|
|
25
|
-
if random_val < 0.5:
|
|
26
|
-
params["chunk_by_api"] = True
|
|
27
|
-
params["chunking_endpoint"] = fake.url()
|
|
28
|
-
params["chunk_api_key"] = fake.password()
|
|
29
|
-
else:
|
|
30
|
-
params["chunk_by_api"] = False
|
|
31
|
-
|
|
32
|
-
return params
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@pytest.mark.parametrize(
|
|
36
|
-
"partition_config_params", [generate_chunker_config_params() for i in range(10)]
|
|
37
|
-
)
|
|
38
|
-
def test_chunker_config(partition_config_params: dict):
|
|
39
|
-
chunker_config = ChunkerConfig.model_validate(partition_config_params)
|
|
40
|
-
assert chunker_config
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
@pytest.mark.parametrize(
|
|
44
|
-
"partition_config_params", [generate_chunker_config_params() for i in range(10)]
|
|
45
|
-
)
|
|
46
|
-
def test_chunker(partition_config_params: dict):
|
|
47
|
-
chunker_config = ChunkerConfig.model_validate(partition_config_params)
|
|
48
|
-
chunker = Chunker(config=chunker_config)
|
|
49
|
-
assert chunker
|
test/unit/connectors/__init__.py
DELETED
|
File without changes
|
|
File without changes
|