unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of unstructured-ingest has been flagged as a potentially problematic release.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
test/integration/utils.py
ADDED
@@ -0,0 +1,15 @@
+import os
+
+import pytest
+
+
+def requires_env(*envs):
+    if len(envs) == 1:
+        env = envs[0]
+        return pytest.mark.skipif(
+            env not in os.environ, reason=f"Environment variable not set: {env}"
+        )
+    return pytest.mark.skipif(
+        not all(env in os.environ for env in envs),
+        reason="All required environment variables not set: {}".format(", ".join(envs)),
+    )
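For context, requires_env is used as a pytest marker factory in the integration suites; a minimal usage sketch (the test name and environment variable names here are hypothetical):

from test.integration.utils import requires_env


# Skipped unless both variables are present in the environment.
@requires_env("S3_INGEST_TEST_ACCESS_KEY", "S3_INGEST_TEST_SECRET_KEY")
def test_s3_roundtrip():
    ...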
test/unit/__init__.py
ADDED
File without changes
test/unit/embed/test_mixedbreadai.py
ADDED
@@ -0,0 +1,41 @@
+from unstructured_ingest.embed.mixedbreadai import (
+    MixedbreadAIEmbeddingConfig,
+    MixedbreadAIEmbeddingEncoder,
+)
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    mock_client = mocker.MagicMock()
+
+    def mock_embeddings(
+        model,
+        normalized,
+        encoding_format,
+        truncation_strategy,
+        request_options,
+        input,
+    ):
+        mock_response = mocker.MagicMock()
+        mock_response.data = [mocker.MagicMock(embedding=[i, i + 1]) for i in range(len(input))]
+        return mock_response
+
+    mock_client.embeddings.side_effect = mock_embeddings
+
+    # Mock get_client to return our mock_client
+    mocker.patch.object(MixedbreadAIEmbeddingConfig, "get_client", return_value=mock_client)
+
+    encoder = MixedbreadAIEmbeddingEncoder(
+        config=MixedbreadAIEmbeddingConfig(
+            api_key="api_key", model_name="mixedbread-ai/mxbai-embed-large-v1"
+        )
+    )
+
+    raw_elements = [{"text": f"This is sentence {i+1}"} for i in range(2)]
+    elements = encoder.embed_documents(
+        elements=raw_elements,
+    )
+    assert len(elements) == 2
+    assert elements[0]["text"] == "This is sentence 1"
+    assert elements[1]["text"] == "This is sentence 2"
+    assert elements[0]["embeddings"] is not None
+    assert elements[1]["embeddings"] is not None
test/unit/embed/test_octoai.py
ADDED
@@ -0,0 +1,20 @@
+from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    # Mocked client with the desired behavior for embed_documents
+    mock_client = mocker.MagicMock()
+    mock_client.embed_documents.return_value = [1, 2]
+
+    # Mock get_client to return our mock_client
+    mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client)
+
+    encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="api_key"))
+    raw_elements = [{"text": f"This is sentence {i+1}"} for i in range(2)]
+
+    elements = encoder.embed_documents(
+        elements=raw_elements,
+    )
+    assert len(elements) == 2
+    assert elements[0]["text"] == "This is sentence 1"
+    assert elements[1]["text"] == "This is sentence 2"
test/unit/embed/test_openai.py
ADDED
@@ -0,0 +1,20 @@
+from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    # Mocked client with the desired behavior for embed_documents
+    mock_client = mocker.MagicMock()
+    mock_client.embed_documents.return_value = [1, 2]
+
+    # Mock get_client to return our mock_client
+    mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)
+
+    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
+    raw_elements = [{"text": f"This is sentence {i+1}"} for i in range(2)]
+
+    elements = encoder.embed_documents(
+        elements=raw_elements,
+    )
+    assert len(elements) == 2
+    assert elements[0]["text"] == "This is sentence 1"
+    assert elements[1]["text"] == "This is sentence 2"
test/unit/embed/test_vertexai.py
ADDED
@@ -0,0 +1,25 @@
+from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    # Mocked client with the desired behavior for embed_documents
+    mock_responses = []
+    for i in [1, 2]:
+        mock_response = mocker.Mock()
+        mocker.patch.object(mock_response, "values", i)
+        mock_responses.append(mock_response)
+
+    mock_client = mocker.MagicMock()
+    mock_client.get_embeddings.return_value = mock_responses
+
+    # Mock create_client to return our mock_client
+    mocker.patch.object(VertexAIEmbeddingConfig, "get_client", return_value=mock_client)
+    encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key={"api_key": "value"}))
+    raw_elements = [{"text": f"This is sentence {i+1}"} for i in range(2)]
+
+    elements = encoder.embed_documents(
+        elements=raw_elements,
+    )
+    assert len(elements) == 2
+    assert elements[0]["text"] == "This is sentence 1"
+    assert elements[1]["text"] == "This is sentence 2"
test/unit/embed/test_voyageai.py
ADDED
@@ -0,0 +1,24 @@
+from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    # Mocked client with the desired behavior for embed_documents
+    mock_response = mocker.MagicMock()
+    mocker.patch.object(mock_response, "embeddings", [1, 2])
+    mock_client = mocker.MagicMock()
+    mock_client.embed.return_value = mock_response
+
+    # Mock get_client to return our mock_client
+    mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client)
+
+    encoder = VoyageAIEmbeddingEncoder(
+        config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
+    )
+    raw_elements = [{"text": f"This is sentence {i+1}"} for i in range(2)]
+
+    elements = encoder.embed_documents(
+        elements=raw_elements,
+    )
+    assert len(elements) == 2
+    assert elements[0]["text"] == "This is sentence 1"
+    assert elements[1]["text"] == "This is sentence 2"
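All five embedder unit tests above follow the same pattern: patch get_client on the config class so no network call is made, then assert that embed_documents returns the original element dicts with embeddings attached. Outside of the mocks, the same public surface applies; a hedged sketch using the OpenAI encoder (the key value is a placeholder and the call would hit the live API):

from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="sk-placeholder"))
elements = encoder.embed_documents(elements=[{"text": "This is sentence 1"}])
# Each element dict comes back intact, with an "embeddings" vector added.
assert "embeddings" in elements[0]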
test/unit/test_chunking_utils.py
ADDED
@@ -0,0 +1,36 @@
+from pathlib import Path
+
+import pytest
+from unstructured.chunking import dispatch
+from unstructured.documents.elements import assign_and_map_hash_ids
+from unstructured.partition.auto import partition
+
+from unstructured_ingest.utils.chunking import (
+    assign_and_map_hash_ids as new_assign_and_map_hash_ids,
+)
+
+test_file_path = Path(__file__).resolve()
+project_root = test_file_path.parents[2]
+docs_path = project_root / "example-docs"
+
+
+@pytest.mark.parametrize(
+    "chunking_strategy",
+    ["basic", "by_title"],
+)
+def test_assign_and_map_hash_ids(chunking_strategy):
+    # Make sure the new logic working on dict content matches the
+    # results if using the unstructured version
+    file_path = docs_path / "book-war-and-peace-1p.txt"
+    elements = partition(filename=str(file_path.resolve()), strategy="fast")
+    chunked_elements = dispatch.chunk(elements=elements, chunking_strategy=chunking_strategy)
+    chunked_elements_copy = chunked_elements.copy()
+
+    hashed_chunked_elements = assign_and_map_hash_ids(chunked_elements)
+    og_chunked_elements_dicts = [e.to_dict() for e in hashed_chunked_elements]
+
+    new_chunked_elements_dicts = [e.to_dict() for e in chunked_elements_copy]
+    new_chunked_elements_dicts = new_assign_and_map_hash_ids(new_chunked_elements_dicts)
+
+    for e1, e2 in zip(og_chunked_elements_dicts, new_chunked_elements_dicts):
+        assert e1 == e2
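What this test establishes: unstructured_ingest.utils.chunking.assign_and_map_hash_ids reproduces the element-based behavior from unstructured while operating on plain dicts, i.e. after elements have been serialized. A minimal sketch of that dict-based path, reusing the calls exercised above:

from unstructured.chunking import dispatch
from unstructured.partition.auto import partition

from unstructured_ingest.utils.chunking import assign_and_map_hash_ids

elements = partition(filename="example-docs/book-war-and-peace-1p.txt", strategy="fast")
chunks = dispatch.chunk(elements=elements, chunking_strategy="basic")
chunk_dicts = [e.to_dict() for e in chunks]
# Deterministic hash ids assigned directly on the serialized dicts.
chunk_dicts = assign_and_map_hash_ids(chunk_dicts)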
test/unit/test_error.py
ADDED
@@ -0,0 +1,27 @@
+import pytest
+
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    PartitionError,
+    SourceConnectionError,
+)
+
+
+@pytest.mark.parametrize(
+    ("error_class", "exception_type", "error_message"),
+    [
+        (SourceConnectionError, ValueError, "Simulated connection error"),
+        (DestinationConnectionError, RuntimeError, "Simulated connection error"),
+        (PartitionError, FileNotFoundError, "Simulated partition error"),
+    ],
+)
+def test_custom_error_decorator(error_class, exception_type, error_message):
+    @error_class.wrap
+    def simulate_error():
+        raise exception_type(error_message)
+
+    with pytest.raises(error_class) as context:
+        simulate_error()
+
+    expected_error_string = error_class.error_string.format(error_message)
+    assert str(context.value) == expected_error_string
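The test pins down a contract the diff itself does not show: each error class exposes a wrap decorator and an error_string template, and an exception raised inside the wrapped callable is re-raised as the custom error carrying the formatted message. A minimal sketch of a class satisfying that contract (an illustration, not the library's implementation):

import functools


class SketchIngestError(Exception):
    error_string = "Simulated error template: {}"

    @classmethod
    def wrap(cls, fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            try:
                return fn(*args, **kwargs)
            except Exception as e:
                # Re-raise as the custom type; str(context.value) then equals
                # error_string.format(original_message), as the test asserts.
                raise cls(cls.error_string.format(e)) from e

        return wrapper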
test/unit/test_interfaces.py
ADDED
@@ -0,0 +1,280 @@
+from __future__ import annotations
+
+import pathlib
+from dataclasses import dataclass
+from typing import Any, Dict
+
+import pytest
+from unstructured.documents.elements import DataSourceMetadata
+from unstructured.partition.auto import partition
+from unstructured.staging.base import elements_to_dicts
+
+from unstructured_ingest.interfaces import (
+    BaseConnectorConfig,
+    BaseSingleIngestDoc,
+    ChunkingConfig,
+    PartitionConfig,
+    ProcessorConfig,
+    ReadConfig,
+)
+
+DIRECTORY = pathlib.Path(__file__).parents[2].resolve()
+EXAMPLE_DOCS_DIRECTORY = DIRECTORY / "example-docs"
+TEST_DOWNLOAD_DIR = "/tmp"
+TEST_OUTPUT_DIR = "/tmp"
+TEST_ID = "test"
+TEST_FILE_PATH = str(EXAMPLE_DOCS_DIRECTORY / "book-war-and-peace-1p.txt")
+
+
+@dataclass
+class ExampleConfig(BaseConnectorConfig):
+    id: str
+    path: str
+
+
+TEST_CONFIG = ExampleConfig(id=TEST_ID, path=TEST_FILE_PATH)
+TEST_SOURCE_URL = "test-source-url"
+TEST_VERSION = "1.1.1"
+TEST_RECORD_LOCATOR = {"id": "data-source-id"}
+TEST_DATE_CREATED = "2021-01-01T00:00:00"
+TEST_DATE_MODIFIED = "2021-01-02T00:00:00"
+TEST_DATE_PROCESSED = "2022-12-13T15:44:08"
+
+
+@dataclass
+class ExampleIngestDoc(BaseSingleIngestDoc):
+    connector_config: ExampleConfig
+
+    @property
+    def filename(self):
+        return TEST_FILE_PATH
+
+    @property
+    def _output_filename(self):
+        return TEST_FILE_PATH + ".json"
+
+    @property
+    def source_url(self) -> str:
+        return TEST_SOURCE_URL
+
+    @property
+    def version(self) -> str:
+        return TEST_VERSION
+
+    @property
+    def record_locator(self) -> Dict[str, Any]:
+        return TEST_RECORD_LOCATOR
+
+    @property
+    def date_created(self) -> str:
+        return TEST_DATE_CREATED
+
+    @property
+    def date_modified(self) -> str:
+        return TEST_DATE_MODIFIED
+
+    @property
+    def exists(self) -> bool:
+        return True
+
+    def cleanup_file(self):
+        pass
+
+    def get_file(self):
+        pass
+
+    def has_output(self):
+        return True
+
+    def write_result(self, result):
+        pass
+
+
+@pytest.fixture
+def partition_test_results():
+    # Reusable partition test results, calculated only once
+    result = partition(
+        filename=str(TEST_FILE_PATH),
+        data_source_metadata=DataSourceMetadata(
+            url=TEST_SOURCE_URL,
+            version=TEST_VERSION,
+            record_locator=TEST_RECORD_LOCATOR,
+            date_created=TEST_DATE_CREATED,
+            date_modified=TEST_DATE_MODIFIED,
+            date_processed=TEST_DATE_PROCESSED,
+        ),
+    )
+    return result
+
+
+@pytest.fixture
+def partition_file_test_results(partition_test_results):
+    # Reusable partition_file test results, calculated only once
+    return elements_to_dicts(partition_test_results)
+
+
+def test_partition_file():
+    """Validate partition_file returns a list of dictionaries with the expected keys,
+    metadata keys, and data source metadata values."""
+    test_ingest_doc = ExampleIngestDoc(
+        connector_config=TEST_CONFIG,
+        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
+        processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
+    )
+    test_ingest_doc._date_processed = TEST_DATE_PROCESSED
+    elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
+    element_dicts = elements_to_dicts(elements)
+    assert len(element_dicts)
+    expected_keys = {
+        "element_id",
+        "text",
+        "type",
+        "metadata",
+    }
+    # The document in TEST_FILE_PATH does not have elements with coordinates so
+    # partition is not expected to return coordinates metadata.
+    expected_metadata_keys = {
+        "data_source",
+        "filename",
+        "file_directory",
+        "filetype",
+        "languages",
+        "last_modified",
+    }
+    for elem in element_dicts:
+        # Parent IDs are non-deterministic - remove them from the test
+        elem["metadata"].pop("parent_id", None)
+
+        assert expected_keys == set(elem.keys())
+        assert expected_metadata_keys == set(elem["metadata"].keys())
+        data_source_metadata = elem["metadata"]["data_source"]
+        assert data_source_metadata["url"] == TEST_SOURCE_URL
+        assert data_source_metadata["version"] == TEST_VERSION
+        assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
+        assert data_source_metadata["date_created"] == TEST_DATE_CREATED
+        assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
+        assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED
+
+
+def test_process_file_fields_include_default(mocker, partition_test_results):
+    """Validate when metadata_include and metadata_exclude are not set, all fields:
+    ("element_id", "text", "type", "metadata") are included"""
+    mock_partition = mocker.patch(
+        "unstructured.partition.auto.partition",
+        return_value=partition_test_results,
+    )
+    test_ingest_doc = ExampleIngestDoc(
+        connector_config=TEST_CONFIG,
+        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
+        processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
+    )
+    elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
+    element_dicts = elements_to_dicts(elements)
+    assert len(element_dicts)
+    assert mock_partition.call_count == 1
+    for elem in element_dicts:
+        # Parent IDs are non-deterministic - remove them from the test
+        elem["metadata"].pop("parent_id", None)
+
+        assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
+        data_source_metadata = elem["metadata"]["data_source"]
+        assert data_source_metadata["url"] == TEST_SOURCE_URL
+        assert data_source_metadata["version"] == TEST_VERSION
+        assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
+        assert data_source_metadata["date_created"] == TEST_DATE_CREATED
+        assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
+        assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED
+
+
+def test_process_file_metadata_includes_filename_and_filetype(
+    mocker,
+    partition_test_results,
+):
+    """Validate when metadata_include is set to "filename,filetype",
+    only filename is included in metadata"""
+    mocker.patch(
+        "unstructured.partition.auto.partition",
+        return_value=partition_test_results,
+    )
+    partition_config = PartitionConfig(
+        metadata_include=["filename", "filetype"],
+    )
+    test_ingest_doc = ExampleIngestDoc(
+        connector_config=TEST_CONFIG,
+        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
+        processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
+    )
+    isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
+    assert len(isd_elems)
+    for elem in isd_elems:
+        # Parent IDs are non-deterministic - remove them from the test
+        elem["metadata"].pop("parent_id", None)
+
+        assert set(elem["metadata"].keys()) == {"filename", "filetype"}
+
+
+def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
+    """Validate when metadata_exclude is set to "filename,page_number",
+    neither filename nor page_number are included in metadata"""
+    mocker.patch(
+        "unstructured.partition.auto.partition",
+        return_value=partition_test_results,
+    )
+    partition_config = PartitionConfig(
+        metadata_exclude=["filename", "page_number"],
+    )
+    test_ingest_doc = ExampleIngestDoc(
+        connector_config=TEST_CONFIG,
+        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
+        processor_config=ProcessorConfig(
+            output_dir=TEST_OUTPUT_DIR,
+        ),
+    )
+    isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
+    assert len(isd_elems)
+    for elem in isd_elems:
+        assert "filename" not in elem["metadata"]
+        assert "page_number" not in elem["metadata"]
+
+
+def test_process_file_flatten_metadata(mocker, partition_test_results):
+    mocker.patch(
+        "unstructured.partition.auto.partition",
+        return_value=partition_test_results,
+    )
+    partition_config = PartitionConfig(
+        metadata_include=["filename", "file_directory", "filetype"],
+        flatten_metadata=True,
+    )
+    test_ingest_doc = ExampleIngestDoc(
+        connector_config=TEST_CONFIG,
+        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
+        processor_config=ProcessorConfig(
+            output_dir=TEST_OUTPUT_DIR,
+        ),
+    )
+    isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
+    expected_keys = {"element_id", "text", "type", "filename", "file_directory", "filetype"}
+    for elem in isd_elems:
+        assert expected_keys == set(elem.keys())
+
+
+class DescribeChunkingConfig:
+    """Unit tests for unstructured.ingest.interfaces.ChunkingConfig"""
+
+    def it_accepts_chunking_strategy_by_itself(self):
+        config = ChunkingConfig(chunking_strategy="basic")
+        assert config.chunking_strategy == "basic"
+
+    def it_defaults_to_chunk_by_title_if_only_chunk_elements_is_True(self):
+        config = ChunkingConfig(chunk_elements=True)
+        assert config.chunking_strategy == "by_title"
+
+    def but_it_defaults_to_chunking_strategy_over_chunk_elements(self):
+        config = ChunkingConfig(chunk_elements=True, chunking_strategy="basic")
+        assert config.chunking_strategy == "basic"
+
+    def it_silently_accepts_unrecognized_chunker(self, caplog: pytest.LogCaptureFixture):
+        config = ChunkingConfig(chunking_strategy="foobar")
+        assert config.chunking_strategy == "foobar"
+        assert caplog.text == ""
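For reference, the flatten_metadata assertion at the end of test_process_file_flatten_metadata implies the following shape change (field values here are illustrative):

# Element as partition() returns it, serialized to a dict:
nested = {
    "element_id": "abc123",
    "text": "Some text",
    "type": "NarrativeText",
    "metadata": {"filename": "doc.txt", "file_directory": "/tmp", "filetype": "text/plain"},
}

# After process_file(...) with flatten_metadata=True and the metadata_include
# list above, the included metadata keys are promoted to the top level and the
# "metadata" key disappears:
flattened = {
    "element_id": "abc123",
    "text": "Some text",
    "type": "NarrativeText",
    "filename": "doc.txt",
    "file_directory": "/tmp",
    "filetype": "text/plain",
}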
test/unit/test_interfaces_v2.py
ADDED
@@ -0,0 +1,26 @@
+import pytest
+from pydantic import Secret, ValidationError
+
+from unstructured_ingest.v2.interfaces import AccessConfig, ConnectionConfig
+
+
+def test_failing_connection_config():
+    class MyAccessConfig(AccessConfig):
+        sensitive_value: str
+
+    class MyConnectionConfig(ConnectionConfig):
+        access_config: MyAccessConfig
+
+    with pytest.raises(ValidationError):
+        MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
+
+
+def test_happy_path_connection_config():
+    class MyAccessConfig(AccessConfig):
+        sensitive_value: str
+
+    class MyConnectionConfig(ConnectionConfig):
+        access_config: Secret[MyAccessConfig]
+
+    connection_config = MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
+    assert connection_config
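The takeaway from this pair of tests: a ConnectionConfig subclass must declare its access_config wrapped in pydantic's Secret, which masks the sensitive fields in reprs and serialized output. Reading the value back goes through pydantic's standard accessor; a small sketch building on the happy-path case above:

from pydantic import Secret

from unstructured_ingest.v2.interfaces import AccessConfig, ConnectionConfig


class MyAccessConfig(AccessConfig):
    sensitive_value: str


class MyConnectionConfig(ConnectionConfig):
    access_config: Secret[MyAccessConfig]


config = MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
print(config)  # the wrapped access config is masked in the repr
print(config.access_config.get_secret_value().sensitive_value)  # "this"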
test/unit/test_logger.py
ADDED
@@ -0,0 +1,78 @@
+import json
+
+import pytest
+
+from unstructured_ingest.logger import (
+    default_is_data_sensitive,
+    hide_sensitive_fields,
+    redact_jsons,
+)
+
+
+@pytest.mark.parametrize(
+    ("key", "value", "is_sensitive"),
+    [
+        ("username", "john_smith", False),
+        ("password", "13?H%", True),
+        ("token", "123", True),
+        ("AWS_CREDENTIAL", "aws_credential", True),
+        ("AWS_KEY", None, False),
+    ],
+)
+def test_default_is_sensitive(key, value, is_sensitive):
+    assert default_is_data_sensitive(key, value) == is_sensitive
+
+
+def test_hide_sensitive_fields():
+    d = {
+        "username": "john_smith",
+        "password": "13?H%",
+        "inner": {
+            "token": "123",
+            "AWS_KEY": None,
+            "inner_j_string": json.dumps(
+                {"account_name": "secret name", "client_id": 123, "timestamp": 123}
+            ),
+        },
+    }
+    redacted_d = hide_sensitive_fields(d)
+    expected_d = {
+        "password": "*******",
+        "username": "john_smith",
+        "inner": {
+            "token": "*******",
+            "AWS_KEY": None,
+            "inner_j_string": json.dumps(
+                {"account_name": "*******", "client_id": "*******", "timestamp": 123}
+            ),
+        },
+    }
+    assert redacted_d == expected_d
+
+
+def test_redact_jsons():
+    d1 = {
+        "username": "john_smith",
+        "password": "13?H%",
+        "inner": {
+            "token": "123",
+            "AWS_KEY": None,
+            "inner_j_string": json.dumps(
+                {"account_name": "secret name", "client_id": 123, "timestamp": 123}
+            ),
+        },
+    }
+
+    d2 = {"username": "tim67", "update_time": 456}
+    d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
+
+    sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
+    expected_string = (
+        'Some topic secret info ({"username": "john_smith", "password": "*******", '
+        '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
+        '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
+        '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
+        'and {"account_name": "*******", "host": "http://localhost:8888"})'
+    )
+    redacted_string = redact_jsons(sensitive_string)
+    assert redacted_string == expected_string
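As a usage note, redact_jsons takes and returns a plain string, so it can sit in front of any log handler. A hypothetical wiring through a logging.Filter (illustrative only; not how the package installs its own redaction):

import logging

from unstructured_ingest.logger import redact_jsons


class RedactingFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        # Render the message, redact any embedded JSON objects, and clear
        # args so the redacted string is emitted verbatim.
        record.msg = redact_jsons(record.getMessage())
        record.args = None
        return True


logging.getLogger("unstructured_ingest").addFilter(RedactingFilter())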