unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +178 -0
- test/integration/connectors/sql/test_sqlite.py +151 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +203 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pathlib
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Any, Dict
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
from unstructured.documents.elements import DataSourceMetadata
|
|
9
|
+
from unstructured.partition.auto import partition
|
|
10
|
+
from unstructured.staging.base import elements_to_dicts
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.interfaces import (
|
|
13
|
+
BaseConnectorConfig,
|
|
14
|
+
BaseSingleIngestDoc,
|
|
15
|
+
ChunkingConfig,
|
|
16
|
+
PartitionConfig,
|
|
17
|
+
ProcessorConfig,
|
|
18
|
+
ReadConfig,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
DIRECTORY = pathlib.Path(__file__).parents[2].resolve()
|
|
22
|
+
EXAMPLE_DOCS_DIRECTORY = DIRECTORY / "example-docs"
|
|
23
|
+
TEST_DOWNLOAD_DIR = "/tmp"
|
|
24
|
+
TEST_OUTPUT_DIR = "/tmp"
|
|
25
|
+
TEST_ID = "test"
|
|
26
|
+
TEST_FILE_PATH = str(EXAMPLE_DOCS_DIRECTORY / "book-war-and-peace-1p.txt")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
|
|
30
|
+
class ExampleConfig(BaseConnectorConfig):
|
|
31
|
+
id: str
|
|
32
|
+
path: str
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
TEST_CONFIG = ExampleConfig(id=TEST_ID, path=TEST_FILE_PATH)
|
|
36
|
+
TEST_SOURCE_URL = "test-source-url"
|
|
37
|
+
TEST_VERSION = "1.1.1"
|
|
38
|
+
TEST_RECORD_LOCATOR = {"id": "data-source-id"}
|
|
39
|
+
TEST_DATE_CREATED = "2021-01-01T00:00:00"
|
|
40
|
+
TEST_DATE_MODIFIED = "2021-01-02T00:00:00"
|
|
41
|
+
TEST_DATE_PROCESSED = "2022-12-13T15:44:08"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
|
|
45
|
+
class ExampleIngestDoc(BaseSingleIngestDoc):
|
|
46
|
+
connector_config: ExampleConfig
|
|
47
|
+
|
|
48
|
+
@property
|
|
49
|
+
def filename(self):
|
|
50
|
+
return TEST_FILE_PATH
|
|
51
|
+
|
|
52
|
+
@property
|
|
53
|
+
def _output_filename(self):
|
|
54
|
+
return TEST_FILE_PATH + ".json"
|
|
55
|
+
|
|
56
|
+
@property
|
|
57
|
+
def source_url(self) -> str:
|
|
58
|
+
return TEST_SOURCE_URL
|
|
59
|
+
|
|
60
|
+
@property
|
|
61
|
+
def version(self) -> str:
|
|
62
|
+
return TEST_VERSION
|
|
63
|
+
|
|
64
|
+
@property
|
|
65
|
+
def record_locator(self) -> Dict[str, Any]:
|
|
66
|
+
return TEST_RECORD_LOCATOR
|
|
67
|
+
|
|
68
|
+
@property
|
|
69
|
+
def date_created(self) -> str:
|
|
70
|
+
return TEST_DATE_CREATED
|
|
71
|
+
|
|
72
|
+
@property
|
|
73
|
+
def date_modified(self) -> str:
|
|
74
|
+
return TEST_DATE_MODIFIED
|
|
75
|
+
|
|
76
|
+
@property
|
|
77
|
+
def exists(self) -> bool:
|
|
78
|
+
return True
|
|
79
|
+
|
|
80
|
+
def cleanup_file(self):
|
|
81
|
+
pass
|
|
82
|
+
|
|
83
|
+
def get_file(self):
|
|
84
|
+
pass
|
|
85
|
+
|
|
86
|
+
def has_output(self):
|
|
87
|
+
return True
|
|
88
|
+
|
|
89
|
+
def write_result(self, result):
|
|
90
|
+
pass
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@pytest.fixture
|
|
94
|
+
def partition_test_results():
|
|
95
|
+
# Reusable partition test results (function-scoped fixture, so recomputed for each test that requests it)
|
|
96
|
+
result = partition(
|
|
97
|
+
filename=str(TEST_FILE_PATH),
|
|
98
|
+
data_source_metadata=DataSourceMetadata(
|
|
99
|
+
url=TEST_SOURCE_URL,
|
|
100
|
+
version=TEST_VERSION,
|
|
101
|
+
record_locator=TEST_RECORD_LOCATOR,
|
|
102
|
+
date_created=TEST_DATE_CREATED,
|
|
103
|
+
date_modified=TEST_DATE_MODIFIED,
|
|
104
|
+
date_processed=TEST_DATE_PROCESSED,
|
|
105
|
+
),
|
|
106
|
+
)
|
|
107
|
+
return result
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@pytest.fixture
|
|
111
|
+
def partition_file_test_results(partition_test_results):
|
|
112
|
+
# Reusable partition_file test results (function-scoped fixture, so recomputed for each test that requests it)
|
|
113
|
+
return elements_to_dicts(partition_test_results)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_partition_file():
|
|
117
|
+
"""Validate partition_file returns a list of dictionaries with the expected keys,
|
|
118
|
+
metadata keys, and data source metadata values."""
|
|
119
|
+
test_ingest_doc = ExampleIngestDoc(
|
|
120
|
+
connector_config=TEST_CONFIG,
|
|
121
|
+
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
122
|
+
processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
|
|
123
|
+
)
|
|
124
|
+
test_ingest_doc._date_processed = TEST_DATE_PROCESSED
|
|
125
|
+
elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
|
|
126
|
+
element_dicts = elements_to_dicts(elements)
|
|
127
|
+
assert len(element_dicts)
|
|
128
|
+
expected_keys = {
|
|
129
|
+
"element_id",
|
|
130
|
+
"text",
|
|
131
|
+
"type",
|
|
132
|
+
"metadata",
|
|
133
|
+
}
|
|
134
|
+
# The document in TEST_FILE_PATH does not have elements with coordinates so
|
|
135
|
+
# partition is not expected to return coordinates metadata.
|
|
136
|
+
expected_metadata_keys = {
|
|
137
|
+
"data_source",
|
|
138
|
+
"filename",
|
|
139
|
+
"file_directory",
|
|
140
|
+
"filetype",
|
|
141
|
+
"languages",
|
|
142
|
+
"last_modified",
|
|
143
|
+
}
|
|
144
|
+
for elem in element_dicts:
|
|
145
|
+
# Parent IDs are non-deterministic - remove them from the test
|
|
146
|
+
elem["metadata"].pop("parent_id", None)
|
|
147
|
+
|
|
148
|
+
assert expected_keys == set(elem.keys())
|
|
149
|
+
assert expected_metadata_keys == set(elem["metadata"].keys())
|
|
150
|
+
data_source_metadata = elem["metadata"]["data_source"]
|
|
151
|
+
assert data_source_metadata["url"] == TEST_SOURCE_URL
|
|
152
|
+
assert data_source_metadata["version"] == TEST_VERSION
|
|
153
|
+
assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
|
|
154
|
+
assert data_source_metadata["date_created"] == TEST_DATE_CREATED
|
|
155
|
+
assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
|
|
156
|
+
assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_process_file_fields_include_default(mocker, partition_test_results):
|
|
160
|
+
"""Validate when metadata_include and metadata_exclude are not set, all fields:
|
|
161
|
+
("element_id", "text", "type", "metadata") are included"""
|
|
162
|
+
mock_partition = mocker.patch(
|
|
163
|
+
"unstructured.partition.auto.partition",
|
|
164
|
+
return_value=partition_test_results,
|
|
165
|
+
)
|
|
166
|
+
test_ingest_doc = ExampleIngestDoc(
|
|
167
|
+
connector_config=TEST_CONFIG,
|
|
168
|
+
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
169
|
+
processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
|
|
170
|
+
)
|
|
171
|
+
elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
|
|
172
|
+
element_dicts = elements_to_dicts(elements)
|
|
173
|
+
assert len(element_dicts)
|
|
174
|
+
assert mock_partition.call_count == 1
|
|
175
|
+
for elem in element_dicts:
|
|
176
|
+
# Parent IDs are non-deterministic - remove them from the test
|
|
177
|
+
elem["metadata"].pop("parent_id", None)
|
|
178
|
+
|
|
179
|
+
assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
|
|
180
|
+
data_source_metadata = elem["metadata"]["data_source"]
|
|
181
|
+
assert data_source_metadata["url"] == TEST_SOURCE_URL
|
|
182
|
+
assert data_source_metadata["version"] == TEST_VERSION
|
|
183
|
+
assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
|
|
184
|
+
assert data_source_metadata["date_created"] == TEST_DATE_CREATED
|
|
185
|
+
assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
|
|
186
|
+
assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def test_process_file_metadata_includes_filename_and_filetype(
|
|
190
|
+
mocker,
|
|
191
|
+
partition_test_results,
|
|
192
|
+
):
|
|
193
|
+
"""Validate when metadata_include is set to "filename,filetype",
|
|
194
|
+
only filename and filetype are included in metadata"""
|
|
195
|
+
mocker.patch(
|
|
196
|
+
"unstructured.partition.auto.partition",
|
|
197
|
+
return_value=partition_test_results,
|
|
198
|
+
)
|
|
199
|
+
partition_config = PartitionConfig(
|
|
200
|
+
metadata_include=["filename", "filetype"],
|
|
201
|
+
)
|
|
202
|
+
test_ingest_doc = ExampleIngestDoc(
|
|
203
|
+
connector_config=TEST_CONFIG,
|
|
204
|
+
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
205
|
+
processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
|
|
206
|
+
)
|
|
207
|
+
isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
|
|
208
|
+
assert len(isd_elems)
|
|
209
|
+
for elem in isd_elems:
|
|
210
|
+
# Parent IDs are non-deterministic - remove them from the test
|
|
211
|
+
elem["metadata"].pop("parent_id", None)
|
|
212
|
+
|
|
213
|
+
assert set(elem["metadata"].keys()) == {"filename", "filetype"}
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
|
|
217
|
+
"""Validate when metadata_exclude is set to "filename,page_number",
|
|
218
|
+
neither filename nor page_number are included in metadata"""
|
|
219
|
+
mocker.patch(
|
|
220
|
+
"unstructured.partition.auto.partition",
|
|
221
|
+
return_value=partition_test_results,
|
|
222
|
+
)
|
|
223
|
+
partition_config = PartitionConfig(
|
|
224
|
+
metadata_exclude=["filename", "page_number"],
|
|
225
|
+
)
|
|
226
|
+
test_ingest_doc = ExampleIngestDoc(
|
|
227
|
+
connector_config=TEST_CONFIG,
|
|
228
|
+
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
229
|
+
processor_config=ProcessorConfig(
|
|
230
|
+
output_dir=TEST_OUTPUT_DIR,
|
|
231
|
+
),
|
|
232
|
+
)
|
|
233
|
+
isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
|
|
234
|
+
assert len(isd_elems)
|
|
235
|
+
for elem in isd_elems:
|
|
236
|
+
assert "filename" not in elem["metadata"]
|
|
237
|
+
assert "page_number" not in elem["metadata"]
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def test_process_file_flatten_metadata(mocker, partition_test_results):
|
|
241
|
+
mocker.patch(
|
|
242
|
+
"unstructured.partition.auto.partition",
|
|
243
|
+
return_value=partition_test_results,
|
|
244
|
+
)
|
|
245
|
+
partition_config = PartitionConfig(
|
|
246
|
+
metadata_include=["filename", "file_directory", "filetype"],
|
|
247
|
+
flatten_metadata=True,
|
|
248
|
+
)
|
|
249
|
+
test_ingest_doc = ExampleIngestDoc(
|
|
250
|
+
connector_config=TEST_CONFIG,
|
|
251
|
+
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
252
|
+
processor_config=ProcessorConfig(
|
|
253
|
+
output_dir=TEST_OUTPUT_DIR,
|
|
254
|
+
),
|
|
255
|
+
)
|
|
256
|
+
isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
|
|
257
|
+
expected_keys = {"element_id", "text", "type", "filename", "file_directory", "filetype"}
|
|
258
|
+
for elem in isd_elems:
|
|
259
|
+
assert expected_keys == set(elem.keys())
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
class DescribeChunkingConfig:
|
|
263
|
+
"""Unit tests for unstructured_ingest.interfaces.ChunkingConfig"""
|
|
264
|
+
|
|
265
|
+
def it_accepts_chunking_strategy_by_itself(self):
|
|
266
|
+
config = ChunkingConfig(chunking_strategy="basic")
|
|
267
|
+
assert config.chunking_strategy == "basic"
|
|
268
|
+
|
|
269
|
+
def it_defaults_to_chunk_by_title_if_only_chunk_elements_is_True(self):
|
|
270
|
+
config = ChunkingConfig(chunk_elements=True)
|
|
271
|
+
assert config.chunking_strategy == "by_title"
|
|
272
|
+
|
|
273
|
+
def but_it_defaults_to_chunking_strategy_over_chunk_elements(self):
|
|
274
|
+
config = ChunkingConfig(chunk_elements=True, chunking_strategy="basic")
|
|
275
|
+
assert config.chunking_strategy == "basic"
|
|
276
|
+
|
|
277
|
+
def it_silently_accepts_unrecognized_chunker(self, caplog: pytest.LogCaptureFixture):
|
|
278
|
+
config = ChunkingConfig(chunking_strategy="foobar")
|
|
279
|
+
assert config.chunking_strategy == "foobar"
|
|
280
|
+
assert caplog.text == ""
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from pydantic import Secret, ValidationError
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.v2.interfaces import AccessConfig, ConnectionConfig
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_failing_connection_config():
|
|
8
|
+
class MyAccessConfig(AccessConfig):
|
|
9
|
+
sensitive_value: str
|
|
10
|
+
|
|
11
|
+
class MyConnectionConfig(ConnectionConfig):
|
|
12
|
+
access_config: MyAccessConfig
|
|
13
|
+
|
|
14
|
+
with pytest.raises(ValidationError):
|
|
15
|
+
MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_happy_path_connection_config():
|
|
19
|
+
class MyAccessConfig(AccessConfig):
|
|
20
|
+
sensitive_value: str
|
|
21
|
+
|
|
22
|
+
class MyConnectionConfig(ConnectionConfig):
|
|
23
|
+
access_config: Secret[MyAccessConfig]
|
|
24
|
+
|
|
25
|
+
connection_config = MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
|
|
26
|
+
assert connection_config
|
test/unit/test_logger.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.logger import (
|
|
6
|
+
default_is_data_sensitive,
|
|
7
|
+
hide_sensitive_fields,
|
|
8
|
+
redact_jsons,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.mark.parametrize(
|
|
13
|
+
("key", "value", "is_sensitive"),
|
|
14
|
+
[
|
|
15
|
+
("username", "john_smith", False),
|
|
16
|
+
("password", "13?H%", True),
|
|
17
|
+
("token", "123", True),
|
|
18
|
+
("AWS_CREDENTIAL", "aws_credential", True),
|
|
19
|
+
("AWS_KEY", None, False),
|
|
20
|
+
],
|
|
21
|
+
)
|
|
22
|
+
def test_default_is_sensitive(key, value, is_sensitive):
|
|
23
|
+
assert default_is_data_sensitive(key, value) == is_sensitive
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_hide_sensitive_fields():
|
|
27
|
+
d = {
|
|
28
|
+
"username": "john_smith",
|
|
29
|
+
"password": "13?H%",
|
|
30
|
+
"inner": {
|
|
31
|
+
"token": "123",
|
|
32
|
+
"AWS_KEY": None,
|
|
33
|
+
"inner_j_string": json.dumps(
|
|
34
|
+
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
|
|
35
|
+
),
|
|
36
|
+
},
|
|
37
|
+
}
|
|
38
|
+
redacted_d = hide_sensitive_fields(d)
|
|
39
|
+
expected_d = {
|
|
40
|
+
"password": "*******",
|
|
41
|
+
"username": "john_smith",
|
|
42
|
+
"inner": {
|
|
43
|
+
"token": "*******",
|
|
44
|
+
"AWS_KEY": None,
|
|
45
|
+
"inner_j_string": json.dumps(
|
|
46
|
+
{"account_name": "*******", "client_id": "*******", "timestamp": 123}
|
|
47
|
+
),
|
|
48
|
+
},
|
|
49
|
+
}
|
|
50
|
+
assert redacted_d == expected_d
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_redact_jsons():
|
|
54
|
+
d1 = {
|
|
55
|
+
"username": "john_smith",
|
|
56
|
+
"password": "13?H%",
|
|
57
|
+
"inner": {
|
|
58
|
+
"token": "123",
|
|
59
|
+
"AWS_KEY": None,
|
|
60
|
+
"inner_j_string": json.dumps(
|
|
61
|
+
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
|
|
62
|
+
),
|
|
63
|
+
},
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
d2 = {"username": "tim67", "update_time": 456}
|
|
67
|
+
d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
|
|
68
|
+
|
|
69
|
+
sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
|
|
70
|
+
expected_string = (
|
|
71
|
+
'Some topic secret info ({"username": "john_smith", "password": "*******", '
|
|
72
|
+
'"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
|
|
73
|
+
'"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
|
|
74
|
+
'\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
|
|
75
|
+
'and {"account_name": "*******", "host": "http://localhost:8888"})'
|
|
76
|
+
)
|
|
77
|
+
redacted_string = redact_jsons(sensitive_string)
|
|
78
|
+
assert redacted_string == expected_string
|
test/unit/test_utils.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
import pytz
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.cli.utils import extract_config
|
|
10
|
+
from unstructured_ingest.interfaces import BaseConfig
|
|
11
|
+
from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
|
|
15
|
+
class A(BaseConfig):
|
|
16
|
+
a: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class B(BaseConfig):
|
|
21
|
+
a: A
|
|
22
|
+
b: int
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
flat_data = {"a": "test", "b": 4, "c": True}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_extract_config_concrete():
|
|
29
|
+
@dataclass
|
|
30
|
+
class C(BaseConfig):
|
|
31
|
+
b: B
|
|
32
|
+
c: bool
|
|
33
|
+
|
|
34
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
35
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
|
|
36
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_extract_config_optional():
|
|
40
|
+
@dataclass
|
|
41
|
+
class C(BaseConfig):
|
|
42
|
+
c: bool
|
|
43
|
+
b: t.Optional[B] = None
|
|
44
|
+
|
|
45
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
46
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
|
|
47
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_extract_config_union():
|
|
51
|
+
@dataclass
|
|
52
|
+
class C(BaseConfig):
|
|
53
|
+
c: bool
|
|
54
|
+
b: t.Optional[t.Union[B, int]] = None
|
|
55
|
+
|
|
56
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
57
|
+
expected_result = {"b": 4, "c": True}
|
|
58
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_extract_config_list():
|
|
62
|
+
@dataclass
|
|
63
|
+
class C(BaseConfig):
|
|
64
|
+
c: t.List[int]
|
|
65
|
+
b: B
|
|
66
|
+
|
|
67
|
+
flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
|
|
68
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
69
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
|
|
70
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_extract_config_optional_list():
|
|
74
|
+
@dataclass
|
|
75
|
+
class C(BaseConfig):
|
|
76
|
+
b: B
|
|
77
|
+
c: t.Optional[t.List[int]] = None
|
|
78
|
+
|
|
79
|
+
flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
|
|
80
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
81
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
|
|
82
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_extract_config_dataclass_list():
|
|
86
|
+
@dataclass
|
|
87
|
+
class C(BaseConfig):
|
|
88
|
+
c: bool
|
|
89
|
+
b: t.List[B] = field(default_factory=list)
|
|
90
|
+
|
|
91
|
+
flat_data = {"a": "test", "c": True}
|
|
92
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
93
|
+
expected_result = {"b": [], "c": True}
|
|
94
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_extract_config_dict():
|
|
98
|
+
@dataclass
|
|
99
|
+
class C(BaseConfig):
|
|
100
|
+
c: bool
|
|
101
|
+
b: t.Dict[str, B] = field(default_factory=dict)
|
|
102
|
+
|
|
103
|
+
flat_data = {"c": True}
|
|
104
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
105
|
+
expected_result = {"c": True, "b": {}}
|
|
106
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_json_to_dict_valid_json():
|
|
110
|
+
json_string = '{"key": "value"}'
|
|
111
|
+
expected_result = {"key": "value"}
|
|
112
|
+
assert json_to_dict(json_string) == expected_result
|
|
113
|
+
assert isinstance(json_to_dict(json_string), dict)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_json_to_dict_malformed_json():
|
|
117
|
+
json_string = '{"key": "value"'
|
|
118
|
+
expected_result = '{"key": "value"'
|
|
119
|
+
assert json_to_dict(json_string) == expected_result
|
|
120
|
+
assert isinstance(json_to_dict(json_string), str)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_json_to_dict_single_quotes():
|
|
124
|
+
json_string = "{'key': 'value'}"
|
|
125
|
+
expected_result = {"key": "value"}
|
|
126
|
+
assert json_to_dict(json_string) == expected_result
|
|
127
|
+
assert isinstance(json_to_dict(json_string), dict)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def test_json_to_dict_path():
|
|
131
|
+
json_string = "/path/to/file.json"
|
|
132
|
+
expected_result = "/path/to/file.json"
|
|
133
|
+
assert json_to_dict(json_string) == expected_result
|
|
134
|
+
assert isinstance(json_to_dict(json_string), str)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def test_ensure_isoformat_datetime_for_datetime():
|
|
138
|
+
dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0))
|
|
139
|
+
assert dt == "2021-01-01T12:00:00"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_ensure_isoformat_datetime_for_datetime_with_tz():
|
|
143
|
+
dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC))
|
|
144
|
+
assert dt == "2021-01-01T12:00:00+00:00"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def test_ensure_isoformat_datetime_for_string():
|
|
148
|
+
dt = ensure_isoformat_datetime("2021-01-01T12:00:00")
|
|
149
|
+
assert dt == "2021-01-01T12:00:00"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_ensure_isoformat_datetime_for_string2():
|
|
153
|
+
dt = ensure_isoformat_datetime("2021-01-01T12:00:00+00:00")
|
|
154
|
+
assert dt == "2021-01-01T12:00:00+00:00"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_ensure_isoformat_datetime_fails_on_string():
|
|
158
|
+
with pytest.raises(ValueError):
|
|
159
|
+
ensure_isoformat_datetime("bad timestamp")
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def test_ensure_isoformat_datetime_fails_on_int():
|
|
163
|
+
with pytest.raises(TypeError):
|
|
164
|
+
ensure_isoformat_datetime(1111)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field, Secret, SecretStr
|
|
5
|
+
from pydantic.types import _SecretBase
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MockChildBaseModel(BaseModel):
|
|
11
|
+
child_secret_str: SecretStr
|
|
12
|
+
child_secret_float: Secret[float]
|
|
13
|
+
child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MockBaseModel(BaseModel):
|
|
17
|
+
secret_str: SecretStr
|
|
18
|
+
not_secret_bool: bool
|
|
19
|
+
secret_child_base: Secret[MockChildBaseModel]
|
|
20
|
+
not_secret_list: list[int] = Field(default_factory=list)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
model = MockBaseModel(
|
|
24
|
+
secret_str="secret string",
|
|
25
|
+
not_secret_bool=False,
|
|
26
|
+
secret_child_base=MockChildBaseModel(
|
|
27
|
+
child_secret_str="child secret string",
|
|
28
|
+
child_secret_float=3.14,
|
|
29
|
+
child_not_secret_dict={"key": "value"},
|
|
30
|
+
),
|
|
31
|
+
not_secret_list=[1, 2, 3],
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_serialize_base_model():
|
|
36
|
+
|
|
37
|
+
serialized_dict = model.model_dump()
|
|
38
|
+
assert isinstance(serialized_dict["secret_str"], _SecretBase)
|
|
39
|
+
assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
|
|
40
|
+
|
|
41
|
+
serialized_dict_w_secrets = serialize_base_model(model=model)
|
|
42
|
+
assert not isinstance(serialized_dict_w_secrets["secret_str"], _SecretBase)
|
|
43
|
+
assert not isinstance(serialized_dict_w_secrets["secret_child_base"], _SecretBase)
|
|
44
|
+
|
|
45
|
+
expected_dict = {
|
|
46
|
+
"secret_str": "secret string",
|
|
47
|
+
"not_secret_bool": False,
|
|
48
|
+
"secret_child_base": {
|
|
49
|
+
"child_secret_str": "child secret string",
|
|
50
|
+
"child_secret_float": 3.14,
|
|
51
|
+
"child_not_secret_dict": {"key": "value"},
|
|
52
|
+
},
|
|
53
|
+
"not_secret_list": [1, 2, 3],
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
assert serialized_dict_w_secrets == expected_dict
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_serialize_base_model_json():
|
|
60
|
+
serialized_json = model.model_dump_json()
|
|
61
|
+
serialized_dict = json.loads(serialized_json)
|
|
62
|
+
expected_dict = {
|
|
63
|
+
"secret_str": "**********",
|
|
64
|
+
"not_secret_bool": False,
|
|
65
|
+
"secret_child_base": "**********",
|
|
66
|
+
"not_secret_list": [1, 2, 3],
|
|
67
|
+
}
|
|
68
|
+
assert expected_dict == serialized_dict
|
|
69
|
+
|
|
70
|
+
serialized_json_w_secrets = serialize_base_model_json(model=model)
|
|
71
|
+
serialized_dict_w_secrets = json.loads(serialized_json_w_secrets)
|
|
72
|
+
expected_dict_w_secrets = {
|
|
73
|
+
"secret_str": "secret string",
|
|
74
|
+
"not_secret_bool": False,
|
|
75
|
+
"secret_child_base": {
|
|
76
|
+
"child_secret_str": "child secret string",
|
|
77
|
+
"child_secret_float": 3.14,
|
|
78
|
+
"child_not_secret_dict": {"key": "value"},
|
|
79
|
+
},
|
|
80
|
+
"not_secret_list": [1, 2, 3],
|
|
81
|
+
}
|
|
82
|
+
assert expected_dict_w_secrets == serialized_dict_w_secrets
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.25" # pragma: no cover
|
|
1
|
+
__version__ = "0.1.1" # pragma: no cover
|
|
@@ -341,9 +341,9 @@ class CliPartitionConfig(PartitionConfig, CliMixin):
|
|
|
341
341
|
),
|
|
342
342
|
click.Option(
|
|
343
343
|
["--partition-endpoint"],
|
|
344
|
-
default="https://api.
|
|
344
|
+
default="https://api.unstructuredapp.io/general/v0/general",
|
|
345
345
|
help="If partitioning via api, use the following host. "
|
|
346
|
-
"Default: https://api.
|
|
346
|
+
"Default: https://api.unstructuredapp.io/general/v0/general",
|
|
347
347
|
),
|
|
348
348
|
click.Option(
|
|
349
349
|
["--api-key"],
|