unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +167 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +23 -0
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/validation.py +73 -22
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +24 -10
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import random
|
|
2
|
+
|
|
3
|
+
import faker
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
|
|
7
|
+
|
|
8
|
+
fake = faker.Faker()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``BedrockEmbeddingConfig``.

    The two AWS credential values and the region are always generated;
    ``embed_model_name`` is added only when a random draw falls below 0.5.
    """
    config_kwargs = {
        "aws_access_key_id": fake.password(),
        "aws_secret_access_key": fake.password(),
        "region_name": fake.city(),
    }
    include_model_name = random.random() < 0.5
    if include_model_name:
        config_kwargs["embed_model_name"] = fake.word()
    return config_kwargs
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy Bedrock config."""
    validated = BedrockEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """A Bedrock encoder can be constructed from each validated random config."""
    validated = BedrockEmbeddingConfig.model_validate(embedder_config_params)
    encoder = BedrockEmbeddingEncoder(config=validated)
    assert encoder
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import faker
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from test.unit.v2.utils.data_generator import generate_random_dictionary
|
|
8
|
+
from unstructured_ingest.embed.huggingface import (
|
|
9
|
+
HuggingFaceEmbeddingConfig,
|
|
10
|
+
HuggingFaceEmbeddingEncoder,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
fake = faker.Faker()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``HuggingFaceEmbeddingConfig``.

    Returns an empty dict when the first random draw is 0.5 or above.
    Otherwise every optional key is present, each independently holding
    either a generated value or ``None``.
    """
    # NOTE(review): the extent of the `if` body was reconstructed from a
    # whitespace-stripped listing; all four keys are assumed to sit inside it.
    if random.random() >= 0.5:
        return {}
    return {
        "embed_model_name": fake.word() if random.random() < 0.5 else None,
        "embedder_model_kwargs": (
            generate_random_dictionary(key_type=str, value_type=Any)
            if random.random() < 0.5
            else None
        ),
        "encode_kwargs": (
            generate_random_dictionary(key_type=str, value_type=Any)
            if random.random() < 0.5
            else None
        ),
        "cache_folder": fake.file_path() if random.random() < 0.5 else None,
    }
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy HuggingFace config."""
    validated = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """A HuggingFace encoder can be constructed from each validated random config."""
    validated = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
    encoder = HuggingFaceEmbeddingEncoder(config=validated)
    assert encoder
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import random
|
|
2
|
+
|
|
3
|
+
import faker
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.mixedbreadai import (
|
|
7
|
+
MixedbreadAIEmbeddingConfig,
|
|
8
|
+
MixedbreadAIEmbeddingEncoder,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
fake = faker.Faker()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``MixedbreadAIEmbeddingConfig``.

    ``api_key`` is always generated; ``embedder_model_name`` is added only
    when a random draw falls below 0.5.
    """
    config_kwargs = {"api_key": fake.password()}
    include_model_name = random.random() < 0.5
    if include_model_name:
        config_kwargs["embedder_model_name"] = fake.word()
    return config_kwargs
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy MixedbreadAI config."""
    validated = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """A MixedbreadAI encoder can be constructed from each validated random config."""
    validated = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = MixedbreadAIEmbeddingEncoder(config=validated)
    assert encoder
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import random
|
|
2
|
+
|
|
3
|
+
import faker
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
7
|
+
|
|
8
|
+
fake = faker.Faker()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``OctoAiEmbeddingConfig``.

    ``api_key`` is always generated; the model name and base URL are added
    together only when a random draw falls below 0.5.
    """
    config_kwargs = {"api_key": fake.password()}
    # NOTE(review): the extent of the `if` body was reconstructed from a
    # whitespace-stripped listing; both optional keys are assumed to sit inside it.
    if random.random() < 0.5:
        config_kwargs.update(
            {
                "embedder_model_name": fake.word(),
                "base_url": fake.url(),
            }
        )
    return config_kwargs
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy OctoAI config."""
    validated = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An OctoAI encoder can be constructed from each validated random config."""
    validated = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
    encoder = OctoAIEmbeddingEncoder(config=validated)
    assert encoder
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import random
|
|
2
|
+
|
|
3
|
+
import faker
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
7
|
+
|
|
8
|
+
fake = faker.Faker()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``OpenAIEmbeddingConfig``.

    ``api_key`` is always generated; the model name and base URL are added
    together only when a random draw falls below 0.5.
    """
    config_kwargs = {"api_key": fake.password()}
    # NOTE(review): the extent of the `if` body was reconstructed from a
    # whitespace-stripped listing; both optional keys are assumed to sit inside it.
    if random.random() < 0.5:
        config_kwargs.update(
            {
                "embedder_model_name": fake.word(),
                "base_url": fake.url(),
            }
        )
    return config_kwargs
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy OpenAI config."""
    validated = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An OpenAI encoder can be constructed from each validated random config."""
    validated = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = OpenAIEmbeddingEncoder(config=validated)
    assert encoder
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import random
|
|
2
|
+
|
|
3
|
+
import faker
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.togetherai import (
|
|
7
|
+
TogetherAIEmbeddingConfig,
|
|
8
|
+
TogetherAIEmbeddingEncoder,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
fake = faker.Faker()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``TogetherAIEmbeddingConfig``.

    ``api_key`` is always generated; ``embedder_model_name`` is added only
    when a random draw falls below 0.5.
    """
    config_kwargs = {"api_key": fake.password()}
    include_model_name = random.random() < 0.5
    if include_model_name:
        config_kwargs["embedder_model_name"] = fake.word()
    return config_kwargs
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy TogetherAI config."""
    validated = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """A TogetherAI encoder can be constructed from each validated random config."""
    validated = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = TogetherAIEmbeddingEncoder(config=validated)
    assert encoder
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import random
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import faker
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from test.unit.v2.utils.data_generator import generate_random_dictionary
|
|
9
|
+
from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
|
10
|
+
|
|
11
|
+
fake = faker.Faker()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``VertexAIEmbeddingConfig``.

    ``api_key`` is always a JSON-encoded random dictionary;
    ``embedder_model_name`` is added only when a random draw falls below 0.5.
    """
    random_payload = generate_random_dictionary(key_type=str, value_type=Any)
    config_kwargs = {"api_key": json.dumps(random_payload)}
    include_model_name = random.random() < 0.5
    if include_model_name:
        config_kwargs["embedder_model_name"] = fake.word()
    return config_kwargs
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy VertexAI config."""
    validated = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """A VertexAI encoder can be constructed from each validated random config."""
    validated = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = VertexAIEmbeddingEncoder(config=validated)
    assert encoder
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import random
|
|
2
|
+
|
|
3
|
+
import faker
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
|
|
7
|
+
|
|
8
|
+
fake = faker.Faker()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def generate_embedder_config_params() -> dict:
    """Build random keyword parameters for ``VoyageAIEmbeddingConfig``.

    ``api_key`` is always generated; the model name, batch size, truncation
    flag, retry count and timeout are added together only when a random draw
    falls below 0.5.
    """
    config_kwargs = {"api_key": fake.password()}
    # NOTE(review): the extent of the `if` body was reconstructed from a
    # whitespace-stripped listing; all five optional keys are assumed to sit inside it.
    if random.random() < 0.5:
        config_kwargs.update(
            {
                "embedder_model_name": fake.word(),
                "batch_size": fake.random_int(),
                "truncation": fake.boolean(),
                "max_retries": fake.random_int(),
                "timeout_in_seconds": fake.random_int(),
            }
        )
    return config_kwargs
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy VoyageAI config."""
    validated = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """A VoyageAI encoder can be constructed from each validated random config."""
    validated = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
    encoder = VoyageAIEmbeddingEncoder(config=validated)
    assert encoder
|
|
File without changes
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import faker
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from test.unit.v2.utils.data_generator import generate_random_dictionary
|
|
8
|
+
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
|
|
9
|
+
|
|
10
|
+
fake = faker.Faker()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def generate_partitioner_config_params() -> dict:
    """Build random keyword parameters for ``PartitionerConfig``.

    Returns:
        A dict that always contains the strategy, the optional partitioning
        settings (each either a generated value or ``None``),
        ``flatten_metadata`` and ``partition_by_api``. It may additionally
        contain ``fields_include``, at most one of ``metadata_exclude`` /
        ``metadata_include``, and, when ``partition_by_api`` is true, the
        ``partition_endpoint`` and ``api_key`` values.
    """
    params = {
        "strategy": random.choice(["fast", "hi_res", "auto"]),
        "ocr_languages": fake.words() if random.random() < 0.5 else None,
        "encoding": fake.word() if random.random() < 0.5 else None,
        "additional_partition_args": (
            generate_random_dictionary(key_type=str, value_type=Any)
            if random.random() < 0.5
            else None
        ),
        "skip_infer_table_types": fake.words() if random.random() < 0.5 else None,
        "flatten_metadata": fake.boolean(),
        "hi_res_model_name": fake.word() if random.random() < 0.5 else None,
    }

    # Randomly set the fields_include to a random list[str]
    if random.random() < 0.5:
        params["fields_include"] = fake.words()

    # Randomly set the metadata_exclude or metadata_include to a valid
    # list[str] or don't set it at all. A single draw drives all three
    # outcomes so they are mutually exclusive and independent of the
    # fields_include decision above.
    metadata_choice = random.random()
    if metadata_choice < (1 / 3):
        params["metadata_exclude"] = fake.words()
    elif metadata_choice < (2 / 3):
        params["metadata_include"] = fake.words()

    # Randomly set the values associated with calling the api, or not at all.
    # These must be real assignments: ``params[key]: value`` is only an
    # annotation and would leave the key unset.
    if random.random() < 0.5:
        params["partition_by_api"] = True
        params["partition_endpoint"] = fake.url()
        params["api_key"] = fake.password()
    else:
        params["partition_by_api"] = False
    return params
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@pytest.mark.parametrize(
    "partition_config_params",
    [generate_partitioner_config_params() for _ in range(10)],
)
def test_partition_config(partition_config_params: dict):
    """Each of the ten random parameter sets validates into a truthy partitioner config."""
    validated = PartitionerConfig.model_validate(partition_config_params)
    assert validated
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@pytest.mark.parametrize(
    "partition_config_params",
    [generate_partitioner_config_params() for _ in range(10)],
)
def test_partitioner(partition_config_params: dict):
    """A Partitioner can be constructed from each validated random config."""
    validated = PartitionerConfig.model_validate(partition_config_params)
    built = Partitioner(config=validated)
    assert built
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from typing import Any, Type
|
|
3
|
+
|
|
4
|
+
from faker import Faker
|
|
5
|
+
|
|
6
|
+
fake = Faker()
|
|
7
|
+
|
|
8
|
+
# Zero-argument generators for dictionary values, looked up by requested type.
# NOTE(review): the ``float`` entry uses ``fake.random_digit``, which presumably
# yields a whole digit rather than a fractional value — confirm this is intended.
type_to_random_value_map = {
    str: fake.sentence,
    int: fake.random_int,
    float: fake.random_digit,
    bool: fake.boolean,
}
# Generators for dictionary keys: identical to the value generators except that
# string keys come from ``fake.word`` instead of ``fake.sentence``.
type_to_random_value_map_key = {**type_to_random_value_map, str: fake.word}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def generate_random_dictionary(key_type: Type = str, value_type: Type = str) -> dict:
    """Create a dictionary with randomly generated keys and values.

    Between one and three entries are generated. Keys come from the key
    generator registered for ``key_type``. When ``value_type`` is ``Any``,
    each value's type is picked at random from the registered value types
    plus ``dict``; a ``dict`` pick produces a nested dictionary through a
    recursive call with the same arguments.

    Args:
        key_type: Type used to look up the key generator.
        value_type: Type used to look up the value generator, or ``Any``.

    Returns:
        The generated dictionary. Entries generated with a repeated key
        overwrite earlier ones, so it can hold fewer entries than were drawn.
    """
    result = {}
    for _ in range(random.randint(1, 3)):
        name = type_to_random_value_map_key[key_type]()
        chosen_type = value_type
        if chosen_type == Any:
            chosen_type = random.choice([*type_to_random_value_map, dict])
        if chosen_type is dict:
            result[name] = generate_random_dictionary(key_type=key_type, value_type=value_type)
        else:
            result[name] = type_to_random_value_map[chosen_type]()
    return result
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.
|
|
1
|
+
__version__ = "0.3.0" # pragma: no cover
|
|
@@ -9,7 +9,7 @@ from unstructured_ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_ba
|
|
|
9
9
|
from .airtable import get_base_src_cmd as airtable_base_src_cmd
|
|
10
10
|
from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
|
|
11
11
|
from .astradb import get_base_src_cmd as astradb_base_src_cmd
|
|
12
|
-
from .
|
|
12
|
+
from .azure_ai_search import get_base_dest_cmd as azure_ai_search_base_dest_cmd
|
|
13
13
|
from .biomed import get_base_src_cmd as biomed_base_src_cmd
|
|
14
14
|
from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
|
|
15
15
|
from .clarifai import get_base_dest_cmd as clarifai_base_dest_cmd
|
|
@@ -118,7 +118,7 @@ base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
|
|
|
118
118
|
gcs_base_dest_cmd,
|
|
119
119
|
kafka_base_dest_cmd,
|
|
120
120
|
s3_base_dest_cmd,
|
|
121
|
-
|
|
121
|
+
azure_ai_search_base_dest_cmd,
|
|
122
122
|
delta_table_dest_cmd,
|
|
123
123
|
sql_base_dest_cmd,
|
|
124
124
|
weaviate_dest_cmd,
|
|
@@ -6,14 +6,14 @@ import click
|
|
|
6
6
|
from unstructured_ingest.cli.interfaces import (
|
|
7
7
|
CliConfig,
|
|
8
8
|
)
|
|
9
|
-
from unstructured_ingest.connector.
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
from unstructured_ingest.connector.azure_ai_search import (
|
|
10
|
+
AzureAISearchWriteConfig,
|
|
11
|
+
SimpleAzureAISearchStorageConfig,
|
|
12
12
|
)
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
@dataclass
|
|
16
|
-
class
|
|
16
|
+
class AzureAISearchCliConfig(SimpleAzureAISearchStorageConfig, CliConfig):
|
|
17
17
|
@staticmethod
|
|
18
18
|
def get_cli_options() -> t.List[click.Option]:
|
|
19
19
|
options = [
|
|
@@ -39,7 +39,7 @@ class AzureCognitiveSearchCliConfig(SimpleAzureCognitiveSearchStorageConfig, Cli
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
@dataclass
|
|
42
|
-
class
|
|
42
|
+
class AzureAISearchCliWriteConfig(AzureAISearchWriteConfig, CliConfig):
|
|
43
43
|
@staticmethod
|
|
44
44
|
def get_cli_options() -> t.List[click.Option]:
|
|
45
45
|
options = [
|
|
@@ -57,9 +57,9 @@ def get_base_dest_cmd():
|
|
|
57
57
|
from unstructured_ingest.cli.base.dest import BaseDestCmd
|
|
58
58
|
|
|
59
59
|
cmd_cls = BaseDestCmd(
|
|
60
|
-
cmd_name="azure-
|
|
61
|
-
cli_config=
|
|
62
|
-
additional_cli_options=[
|
|
63
|
-
write_config=
|
|
60
|
+
cmd_name="azure-ai-search",
|
|
61
|
+
cli_config=AzureAISearchCliConfig,
|
|
62
|
+
additional_cli_options=[AzureAISearchCliWriteConfig],
|
|
63
|
+
write_config=AzureAISearchCliWriteConfig,
|
|
64
64
|
)
|
|
65
65
|
return cmd_cls
|
|
@@ -19,28 +19,28 @@ if t.TYPE_CHECKING:
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
@dataclass
|
|
22
|
-
class
|
|
22
|
+
class AzureAiSearchAccessConfig(AccessConfig):
|
|
23
23
|
key: str = enhanced_field(sensitive=True)
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
@dataclass
|
|
27
|
-
class
|
|
27
|
+
class SimpleAzureAISearchStorageConfig(BaseConnectorConfig):
|
|
28
28
|
endpoint: str
|
|
29
|
-
access_config:
|
|
29
|
+
access_config: AzureAiSearchAccessConfig
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
@dataclass
|
|
33
|
-
class
|
|
33
|
+
class AzureAISearchWriteConfig(WriteConfig):
|
|
34
34
|
index: str
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
@dataclass
|
|
38
|
-
class
|
|
39
|
-
write_config:
|
|
40
|
-
connector_config:
|
|
38
|
+
class AzureAISearchDestinationConnector(BaseDestinationConnector):
|
|
39
|
+
write_config: AzureAISearchWriteConfig
|
|
40
|
+
connector_config: SimpleAzureAISearchStorageConfig
|
|
41
41
|
_client: t.Optional["SearchClient"] = field(init=False, default=None)
|
|
42
42
|
|
|
43
|
-
@requires_dependencies(["azure.search"], extras="azure-
|
|
43
|
+
@requires_dependencies(["azure.search"], extras="azure-ai-search")
|
|
44
44
|
def generate_client(self) -> "SearchClient":
|
|
45
45
|
from azure.core.credentials import AzureKeyCredential
|
|
46
46
|
from azure.search.documents import SearchClient
|
|
@@ -112,7 +112,7 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
|
|
|
112
112
|
if page_number := data.get("metadata", {}).get("page_number"):
|
|
113
113
|
data["metadata"]["page_number"] = str(page_number)
|
|
114
114
|
|
|
115
|
-
@requires_dependencies(["azure"], extras="azure-
|
|
115
|
+
@requires_dependencies(["azure"], extras="azure-ai-search")
|
|
116
116
|
def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
|
|
117
117
|
import azure.core.exceptions
|
|
118
118
|
|
|
@@ -181,7 +181,6 @@ class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
|
|
181
181
|
logger.debug(f"found {len(collected)} messages, stopping")
|
|
182
182
|
consumer.commit(asynchronous=False)
|
|
183
183
|
break
|
|
184
|
-
|
|
185
184
|
return [
|
|
186
185
|
KafkaIngestDoc(
|
|
187
186
|
connector_config=self.connector_config,
|
|
@@ -21,6 +21,7 @@ from unstructured_ingest.enhanced_dataclass.core import _asdict
|
|
|
21
21
|
from unstructured_ingest.error import PartitionError, SourceConnectionError
|
|
22
22
|
from unstructured_ingest.logger import logger
|
|
23
23
|
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
24
|
+
from unstructured_ingest.v2.unstructured_api import call_api
|
|
24
25
|
|
|
25
26
|
if TYPE_CHECKING:
|
|
26
27
|
from unstructured.documents.elements import Element
|
|
@@ -565,6 +566,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
|
|
|
565
566
|
) -> list["Element"]:
|
|
566
567
|
from unstructured.documents.elements import DataSourceMetadata
|
|
567
568
|
from unstructured.partition.auto import partition
|
|
569
|
+
from unstructured.staging.base import elements_from_dicts
|
|
568
570
|
|
|
569
571
|
if not partition_config.partition_by_api:
|
|
570
572
|
logger.debug("Using local partition")
|
|
@@ -582,18 +584,16 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
|
|
|
582
584
|
**partition_kwargs,
|
|
583
585
|
)
|
|
584
586
|
else:
|
|
585
|
-
from unstructured.partition.api import partition_via_api
|
|
586
|
-
|
|
587
587
|
endpoint = partition_config.partition_endpoint
|
|
588
588
|
|
|
589
589
|
logger.debug(f"using remote partition ({endpoint})")
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
filename=str(self.filename),
|
|
590
|
+
elements_dicts = call_api(
|
|
591
|
+
server_url=endpoint,
|
|
593
592
|
api_key=partition_config.api_key,
|
|
594
|
-
|
|
595
|
-
|
|
593
|
+
filename=Path(self.filename),
|
|
594
|
+
api_parameters=partition_kwargs,
|
|
596
595
|
)
|
|
596
|
+
elements = elements_from_dicts(elements_dicts)
|
|
597
597
|
# TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
|
|
598
598
|
# pass the stringified json here
|
|
599
599
|
return elements
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import typing as t
|
|
2
2
|
|
|
3
3
|
from .astradb import AstraDBWriter
|
|
4
|
-
from .
|
|
4
|
+
from .azure_ai_search import AzureAiSearchWriter
|
|
5
5
|
from .base_writer import Writer
|
|
6
6
|
from .chroma import ChromaWriter
|
|
7
7
|
from .clarifai import ClarifaiWriter
|
|
@@ -25,7 +25,7 @@ from .weaviate import WeaviateWriter
|
|
|
25
25
|
writer_map: t.Dict[str, t.Type[Writer]] = {
|
|
26
26
|
"astradb": AstraDBWriter,
|
|
27
27
|
"azure": AzureWriter,
|
|
28
|
-
"
|
|
28
|
+
"azure_ai_search": AzureAiSearchWriter,
|
|
29
29
|
"box": BoxWriter,
|
|
30
30
|
"chroma": ChromaWriter,
|
|
31
31
|
"clarifai": ClarifaiWriter,
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.interfaces import BaseDestinationConnector
|
|
5
|
+
from unstructured_ingest.runner.writers.base_writer import Writer
|
|
6
|
+
|
|
7
|
+
if t.TYPE_CHECKING:
|
|
8
|
+
from unstructured_ingest.connector.azure_ai_search import (
|
|
9
|
+
AzureAISearchWriteConfig,
|
|
10
|
+
SimpleAzureAISearchStorageConfig,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class AzureAiSearchWriter(Writer):
    """Writer that resolves to the Azure AI Search destination connector."""

    # Both annotations are strings because the config classes are imported
    # only under ``t.TYPE_CHECKING`` in this module.
    connector_config: "SimpleAzureAISearchStorageConfig"
    write_config: "AzureAISearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the ``AzureAISearchDestinationConnector`` class."""
        # The connector module is imported here, at call time, rather than at
        # module import.
        from unstructured_ingest.connector import azure_ai_search

        return azure_ai_search.AzureAISearchDestinationConnector
|
|
@@ -9,7 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
|
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
10
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
11
11
|
from unstructured_ingest.v2.logger import logger
|
|
12
|
-
from unstructured_ingest.v2.unstructured_api import
|
|
12
|
+
from unstructured_ingest.v2.unstructured_api import call_api_async
|
|
13
13
|
|
|
14
14
|
CHUNK_MAX_CHARS_DEFAULT: int = 500
|
|
15
15
|
CHUNK_MULTI_PAGE_DEFAULT: bool = True
|
|
@@ -112,7 +112,7 @@ class Chunker(BaseProcess, ABC):
|
|
|
112
112
|
|
|
113
113
|
@requires_dependencies(dependencies=["unstructured_client"], extras="remote")
|
|
114
114
|
async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
|
|
115
|
-
elements = await
|
|
115
|
+
elements = await call_api_async(
|
|
116
116
|
server_url=self.config.chunking_endpoint,
|
|
117
117
|
api_key=self.config.chunk_api_key.get_secret_value(),
|
|
118
118
|
filename=elements_filepath,
|