unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. More details are available on the original diff page (the link was not preserved in this text).

Files changed (86)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/sql/__init__.py +0 -0
  10. test/integration/connectors/sql/test_postgres.py +178 -0
  11. test/integration/connectors/sql/test_sqlite.py +151 -0
  12. test/integration/connectors/test_s3.py +152 -0
  13. test/integration/connectors/utils/__init__.py +0 -0
  14. test/integration/connectors/utils/constants.py +7 -0
  15. test/integration/connectors/utils/docker_compose.py +44 -0
  16. test/integration/connectors/utils/validation.py +203 -0
  17. test/integration/embedders/__init__.py +0 -0
  18. test/integration/embedders/conftest.py +13 -0
  19. test/integration/embedders/test_bedrock.py +49 -0
  20. test/integration/embedders/test_huggingface.py +26 -0
  21. test/integration/embedders/test_mixedbread.py +47 -0
  22. test/integration/embedders/test_octoai.py +41 -0
  23. test/integration/embedders/test_openai.py +41 -0
  24. test/integration/embedders/test_vertexai.py +41 -0
  25. test/integration/embedders/test_voyageai.py +41 -0
  26. test/integration/embedders/togetherai.py +43 -0
  27. test/integration/embedders/utils.py +44 -0
  28. test/integration/partitioners/__init__.py +0 -0
  29. test/integration/partitioners/test_partitioner.py +75 -0
  30. test/integration/utils.py +15 -0
  31. test/unit/__init__.py +0 -0
  32. test/unit/embed/__init__.py +0 -0
  33. test/unit/embed/test_mixedbreadai.py +41 -0
  34. test/unit/embed/test_octoai.py +20 -0
  35. test/unit/embed/test_openai.py +20 -0
  36. test/unit/embed/test_vertexai.py +25 -0
  37. test/unit/embed/test_voyageai.py +24 -0
  38. test/unit/test_chunking_utils.py +36 -0
  39. test/unit/test_error.py +27 -0
  40. test/unit/test_interfaces.py +280 -0
  41. test/unit/test_interfaces_v2.py +26 -0
  42. test/unit/test_logger.py +78 -0
  43. test/unit/test_utils.py +164 -0
  44. test/unit/test_utils_v2.py +82 -0
  45. unstructured_ingest/__version__.py +1 -1
  46. unstructured_ingest/cli/interfaces.py +2 -2
  47. unstructured_ingest/connector/notion/types/block.py +1 -0
  48. unstructured_ingest/connector/notion/types/database.py +1 -0
  49. unstructured_ingest/connector/notion/types/page.py +1 -0
  50. unstructured_ingest/embed/bedrock.py +0 -20
  51. unstructured_ingest/embed/huggingface.py +0 -21
  52. unstructured_ingest/embed/interfaces.py +29 -3
  53. unstructured_ingest/embed/mixedbreadai.py +0 -36
  54. unstructured_ingest/embed/octoai.py +2 -24
  55. unstructured_ingest/embed/openai.py +0 -20
  56. unstructured_ingest/embed/togetherai.py +40 -0
  57. unstructured_ingest/embed/vertexai.py +0 -20
  58. unstructured_ingest/embed/voyageai.py +1 -24
  59. unstructured_ingest/interfaces.py +1 -1
  60. unstructured_ingest/v2/cli/utils/click.py +21 -2
  61. unstructured_ingest/v2/interfaces/connector.py +22 -2
  62. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  63. unstructured_ingest/v2/processes/chunker.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  65. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  70. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  71. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  72. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  74. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  75. unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
  76. unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
  77. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
  78. unstructured_ingest/v2/processes/embedder.py +13 -0
  79. unstructured_ingest/v2/processes/partitioner.py +2 -1
  80. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
  83. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  84. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
test/unit/test_interfaces.py +280 -0
@@ -0,0 +1,280 @@
1
+ from __future__ import annotations
2
+
3
+ import pathlib
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict
6
+
7
+ import pytest
8
+ from unstructured.documents.elements import DataSourceMetadata
9
+ from unstructured.partition.auto import partition
10
+ from unstructured.staging.base import elements_to_dicts
11
+
12
+ from unstructured_ingest.interfaces import (
13
+ BaseConnectorConfig,
14
+ BaseSingleIngestDoc,
15
+ ChunkingConfig,
16
+ PartitionConfig,
17
+ ProcessorConfig,
18
+ ReadConfig,
19
+ )
20
+
21
+ DIRECTORY = pathlib.Path(__file__).parents[2].resolve()
22
+ EXAMPLE_DOCS_DIRECTORY = DIRECTORY / "example-docs"
23
+ TEST_DOWNLOAD_DIR = "/tmp"
24
+ TEST_OUTPUT_DIR = "/tmp"
25
+ TEST_ID = "test"
26
+ TEST_FILE_PATH = str(EXAMPLE_DOCS_DIRECTORY / "book-war-and-peace-1p.txt")
27
+
28
+
29
+ @dataclass
30
+ class ExampleConfig(BaseConnectorConfig):
31
+ id: str
32
+ path: str
33
+
34
+
35
+ TEST_CONFIG = ExampleConfig(id=TEST_ID, path=TEST_FILE_PATH)
36
+ TEST_SOURCE_URL = "test-source-url"
37
+ TEST_VERSION = "1.1.1"
38
+ TEST_RECORD_LOCATOR = {"id": "data-source-id"}
39
+ TEST_DATE_CREATED = "2021-01-01T00:00:00"
40
+ TEST_DATE_MODIFIED = "2021-01-02T00:00:00"
41
+ TEST_DATE_PROCESSED = "2022-12-13T15:44:08"
42
+
43
+
44
+ @dataclass
45
+ class ExampleIngestDoc(BaseSingleIngestDoc):
46
+ connector_config: ExampleConfig
47
+
48
+ @property
49
+ def filename(self):
50
+ return TEST_FILE_PATH
51
+
52
+ @property
53
+ def _output_filename(self):
54
+ return TEST_FILE_PATH + ".json"
55
+
56
+ @property
57
+ def source_url(self) -> str:
58
+ return TEST_SOURCE_URL
59
+
60
+ @property
61
+ def version(self) -> str:
62
+ return TEST_VERSION
63
+
64
+ @property
65
+ def record_locator(self) -> Dict[str, Any]:
66
+ return TEST_RECORD_LOCATOR
67
+
68
+ @property
69
+ def date_created(self) -> str:
70
+ return TEST_DATE_CREATED
71
+
72
+ @property
73
+ def date_modified(self) -> str:
74
+ return TEST_DATE_MODIFIED
75
+
76
+ @property
77
+ def exists(self) -> bool:
78
+ return True
79
+
80
+ def cleanup_file(self):
81
+ pass
82
+
83
+ def get_file(self):
84
+ pass
85
+
86
+ def has_output(self):
87
+ return True
88
+
89
+ def write_result(self, result):
90
+ pass
91
+
92
+
93
+ @pytest.fixture
94
+ def partition_test_results():
95
+ # Reusable partition test results, calculated only once
96
+ result = partition(
97
+ filename=str(TEST_FILE_PATH),
98
+ data_source_metadata=DataSourceMetadata(
99
+ url=TEST_SOURCE_URL,
100
+ version=TEST_VERSION,
101
+ record_locator=TEST_RECORD_LOCATOR,
102
+ date_created=TEST_DATE_CREATED,
103
+ date_modified=TEST_DATE_MODIFIED,
104
+ date_processed=TEST_DATE_PROCESSED,
105
+ ),
106
+ )
107
+ return result
108
+
109
+
110
+ @pytest.fixture
111
+ def partition_file_test_results(partition_test_results):
112
+ # Reusable partition_file test results, calculated only once
113
+ return elements_to_dicts(partition_test_results)
114
+
115
+
116
+ def test_partition_file():
117
+ """Validate partition_file returns a list of dictionaries with the expected keys,
118
+ metadatakeys, and data source metadata values."""
119
+ test_ingest_doc = ExampleIngestDoc(
120
+ connector_config=TEST_CONFIG,
121
+ read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
122
+ processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
123
+ )
124
+ test_ingest_doc._date_processed = TEST_DATE_PROCESSED
125
+ elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
126
+ element_dicts = elements_to_dicts(elements)
127
+ assert len(element_dicts)
128
+ expected_keys = {
129
+ "element_id",
130
+ "text",
131
+ "type",
132
+ "metadata",
133
+ }
134
+ # The document in TEST_FILE_PATH does not have elements with coordinates so
135
+ # partition is not expected to return coordinates metadata.
136
+ expected_metadata_keys = {
137
+ "data_source",
138
+ "filename",
139
+ "file_directory",
140
+ "filetype",
141
+ "languages",
142
+ "last_modified",
143
+ }
144
+ for elem in element_dicts:
145
+ # Parent IDs are non-deterministic - remove them from the test
146
+ elem["metadata"].pop("parent_id", None)
147
+
148
+ assert expected_keys == set(elem.keys())
149
+ assert expected_metadata_keys == set(elem["metadata"].keys())
150
+ data_source_metadata = elem["metadata"]["data_source"]
151
+ assert data_source_metadata["url"] == TEST_SOURCE_URL
152
+ assert data_source_metadata["version"] == TEST_VERSION
153
+ assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
154
+ assert data_source_metadata["date_created"] == TEST_DATE_CREATED
155
+ assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
156
+ assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED
157
+
158
+
159
+ def test_process_file_fields_include_default(mocker, partition_test_results):
160
+ """Validate when metadata_include and metadata_exclude are not set, all fields:
161
+ ("element_id", "text", "type", "metadata") are included"""
162
+ mock_partition = mocker.patch(
163
+ "unstructured.partition.auto.partition",
164
+ return_value=partition_test_results,
165
+ )
166
+ test_ingest_doc = ExampleIngestDoc(
167
+ connector_config=TEST_CONFIG,
168
+ read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
169
+ processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
170
+ )
171
+ elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
172
+ element_dicts = elements_to_dicts(elements)
173
+ assert len(element_dicts)
174
+ assert mock_partition.call_count == 1
175
+ for elem in element_dicts:
176
+ # Parent IDs are non-deterministic - remove them from the test
177
+ elem["metadata"].pop("parent_id", None)
178
+
179
+ assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
180
+ data_source_metadata = elem["metadata"]["data_source"]
181
+ assert data_source_metadata["url"] == TEST_SOURCE_URL
182
+ assert data_source_metadata["version"] == TEST_VERSION
183
+ assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
184
+ assert data_source_metadata["date_created"] == TEST_DATE_CREATED
185
+ assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
186
+ assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED
187
+
188
+
189
+ def test_process_file_metadata_includes_filename_and_filetype(
190
+ mocker,
191
+ partition_test_results,
192
+ ):
193
+ """Validate when metadata_include is set to "filename,filetype",
194
+ only filename is included in metadata"""
195
+ mocker.patch(
196
+ "unstructured.partition.auto.partition",
197
+ return_value=partition_test_results,
198
+ )
199
+ partition_config = PartitionConfig(
200
+ metadata_include=["filename", "filetype"],
201
+ )
202
+ test_ingest_doc = ExampleIngestDoc(
203
+ connector_config=TEST_CONFIG,
204
+ read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
205
+ processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
206
+ )
207
+ isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
208
+ assert len(isd_elems)
209
+ for elem in isd_elems:
210
+ # Parent IDs are non-deterministic - remove them from the test
211
+ elem["metadata"].pop("parent_id", None)
212
+
213
+ assert set(elem["metadata"].keys()) == {"filename", "filetype"}
214
+
215
+
216
+ def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
217
+ """Validate when metadata_exclude is set to "filename,page_number",
218
+ neither filename nor page_number are included in metadata"""
219
+ mocker.patch(
220
+ "unstructured.partition.auto.partition",
221
+ return_value=partition_test_results,
222
+ )
223
+ partition_config = PartitionConfig(
224
+ metadata_exclude=["filename", "page_number"],
225
+ )
226
+ test_ingest_doc = ExampleIngestDoc(
227
+ connector_config=TEST_CONFIG,
228
+ read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
229
+ processor_config=ProcessorConfig(
230
+ output_dir=TEST_OUTPUT_DIR,
231
+ ),
232
+ )
233
+ isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
234
+ assert len(isd_elems)
235
+ for elem in isd_elems:
236
+ assert "filename" not in elem["metadata"]
237
+ assert "page_number" not in elem["metadata"]
238
+
239
+
240
+ def test_process_file_flatten_metadata(mocker, partition_test_results):
241
+ mocker.patch(
242
+ "unstructured.partition.auto.partition",
243
+ return_value=partition_test_results,
244
+ )
245
+ partition_config = PartitionConfig(
246
+ metadata_include=["filename", "file_directory", "filetype"],
247
+ flatten_metadata=True,
248
+ )
249
+ test_ingest_doc = ExampleIngestDoc(
250
+ connector_config=TEST_CONFIG,
251
+ read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
252
+ processor_config=ProcessorConfig(
253
+ output_dir=TEST_OUTPUT_DIR,
254
+ ),
255
+ )
256
+ isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
257
+ expected_keys = {"element_id", "text", "type", "filename", "file_directory", "filetype"}
258
+ for elem in isd_elems:
259
+ assert expected_keys == set(elem.keys())
260
+
261
+
262
+ class DescribeChunkingConfig:
263
+ """Unit tests for unstructured.ingest.interfaces.ChunkingConfig"""
264
+
265
+ def it_accepts_chunking_strategy_by_itself(self):
266
+ config = ChunkingConfig(chunking_strategy="basic")
267
+ assert config.chunking_strategy == "basic"
268
+
269
+ def it_defaults_to_chunk_by_title_if_only_chunk_elements_is_True(self):
270
+ config = ChunkingConfig(chunk_elements=True)
271
+ assert config.chunking_strategy == "by_title"
272
+
273
+ def but_it_defaults_to_chunking_strategy_over_chunk_elements(self):
274
+ config = ChunkingConfig(chunk_elements=True, chunking_strategy="basic")
275
+ assert config.chunking_strategy == "basic"
276
+
277
+ def it_silently_accepts_unrecognized_chunker(self, caplog: pytest.LogCaptureFixture):
278
+ config = ChunkingConfig(chunking_strategy="foobar")
279
+ assert config.chunking_strategy == "foobar"
280
+ assert caplog.text == ""
test/unit/test_interfaces_v2.py +26 -0
@@ -0,0 +1,26 @@
1
+ import pytest
2
+ from pydantic import Secret, ValidationError
3
+
4
+ from unstructured_ingest.v2.interfaces import AccessConfig, ConnectionConfig
5
+
6
+
7
+ def test_failing_connection_config():
8
+ class MyAccessConfig(AccessConfig):
9
+ sensitive_value: str
10
+
11
+ class MyConnectionConfig(ConnectionConfig):
12
+ access_config: MyAccessConfig
13
+
14
+ with pytest.raises(ValidationError):
15
+ MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
16
+
17
+
18
+ def test_happy_path_connection_config():
19
+ class MyAccessConfig(AccessConfig):
20
+ sensitive_value: str
21
+
22
+ class MyConnectionConfig(ConnectionConfig):
23
+ access_config: Secret[MyAccessConfig]
24
+
25
+ connection_config = MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
26
+ assert connection_config
test/unit/test_logger.py +78 -0
@@ -0,0 +1,78 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from unstructured_ingest.logger import (
6
+ default_is_data_sensitive,
7
+ hide_sensitive_fields,
8
+ redact_jsons,
9
+ )
10
+
11
+
12
+ @pytest.mark.parametrize(
13
+ ("key", "value", "is_sensitive"),
14
+ [
15
+ ("username", "john_smith", False),
16
+ ("password", "13?H%", True),
17
+ ("token", "123", True),
18
+ ("AWS_CREDENTIAL", "aws_credential", True),
19
+ ("AWS_KEY", None, False),
20
+ ],
21
+ )
22
+ def test_default_is_sensitive(key, value, is_sensitive):
23
+ assert default_is_data_sensitive(key, value) == is_sensitive
24
+
25
+
26
+ def test_hide_sensitive_fields():
27
+ d = {
28
+ "username": "john_smith",
29
+ "password": "13?H%",
30
+ "inner": {
31
+ "token": "123",
32
+ "AWS_KEY": None,
33
+ "inner_j_string": json.dumps(
34
+ {"account_name": "secret name", "client_id": 123, "timestamp": 123}
35
+ ),
36
+ },
37
+ }
38
+ redacted_d = hide_sensitive_fields(d)
39
+ expected_d = {
40
+ "password": "*******",
41
+ "username": "john_smith",
42
+ "inner": {
43
+ "token": "*******",
44
+ "AWS_KEY": None,
45
+ "inner_j_string": json.dumps(
46
+ {"account_name": "*******", "client_id": "*******", "timestamp": 123}
47
+ ),
48
+ },
49
+ }
50
+ assert redacted_d == expected_d
51
+
52
+
53
+ def test_redact_jsons():
54
+ d1 = {
55
+ "username": "john_smith",
56
+ "password": "13?H%",
57
+ "inner": {
58
+ "token": "123",
59
+ "AWS_KEY": None,
60
+ "inner_j_string": json.dumps(
61
+ {"account_name": "secret name", "client_id": 123, "timestamp": 123}
62
+ ),
63
+ },
64
+ }
65
+
66
+ d2 = {"username": "tim67", "update_time": 456}
67
+ d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
68
+
69
+ sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
70
+ expected_string = (
71
+ 'Some topic secret info ({"username": "john_smith", "password": "*******", '
72
+ '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
73
+ '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
74
+ '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
75
+ 'and {"account_name": "*******", "host": "http://localhost:8888"})'
76
+ )
77
+ redacted_string = redact_jsons(sensitive_string)
78
+ assert redacted_string == expected_string
test/unit/test_utils.py +164 -0
@@ -0,0 +1,164 @@
1
+ import json
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+
6
+ import pytest
7
+ import pytz
8
+
9
+ from unstructured_ingest.cli.utils import extract_config
10
+ from unstructured_ingest.interfaces import BaseConfig
11
+ from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
12
+
13
+
14
+ @dataclass
15
+ class A(BaseConfig):
16
+ a: str
17
+
18
+
19
+ @dataclass
20
+ class B(BaseConfig):
21
+ a: A
22
+ b: int
23
+
24
+
25
+ flat_data = {"a": "test", "b": 4, "c": True}
26
+
27
+
28
+ def test_extract_config_concrete():
29
+ @dataclass
30
+ class C(BaseConfig):
31
+ b: B
32
+ c: bool
33
+
34
+ c = extract_config(flat_data=flat_data, config=C)
35
+ expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
36
+ assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
37
+
38
+
39
+ def test_extract_config_optional():
40
+ @dataclass
41
+ class C(BaseConfig):
42
+ c: bool
43
+ b: t.Optional[B] = None
44
+
45
+ c = extract_config(flat_data=flat_data, config=C)
46
+ expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
47
+ assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
48
+
49
+
50
+ def test_extract_config_union():
51
+ @dataclass
52
+ class C(BaseConfig):
53
+ c: bool
54
+ b: t.Optional[t.Union[B, int]] = None
55
+
56
+ c = extract_config(flat_data=flat_data, config=C)
57
+ expected_result = {"b": 4, "c": True}
58
+ assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
59
+
60
+
61
+ def test_extract_config_list():
62
+ @dataclass
63
+ class C(BaseConfig):
64
+ c: t.List[int]
65
+ b: B
66
+
67
+ flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
68
+ c = extract_config(flat_data=flat_data, config=C)
69
+ expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
70
+ assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
71
+
72
+
73
+ def test_extract_config_optional_list():
74
+ @dataclass
75
+ class C(BaseConfig):
76
+ b: B
77
+ c: t.Optional[t.List[int]] = None
78
+
79
+ flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
80
+ c = extract_config(flat_data=flat_data, config=C)
81
+ expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
82
+ assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
83
+
84
+
85
+ def test_extract_config_dataclass_list():
86
+ @dataclass
87
+ class C(BaseConfig):
88
+ c: bool
89
+ b: t.List[B] = field(default_factory=list)
90
+
91
+ flat_data = {"a": "test", "c": True}
92
+ c = extract_config(flat_data=flat_data, config=C)
93
+ expected_result = {"b": [], "c": True}
94
+ assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
95
+
96
+
97
+ def test_extract_config_dict():
98
+ @dataclass
99
+ class C(BaseConfig):
100
+ c: bool
101
+ b: t.Dict[str, B] = field(default_factory=dict)
102
+
103
+ flat_data = {"c": True}
104
+ c = extract_config(flat_data=flat_data, config=C)
105
+ expected_result = {"c": True, "b": {}}
106
+ assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
107
+
108
+
109
+ def test_json_to_dict_valid_json():
110
+ json_string = '{"key": "value"}'
111
+ expected_result = {"key": "value"}
112
+ assert json_to_dict(json_string) == expected_result
113
+ assert isinstance(json_to_dict(json_string), dict)
114
+
115
+
116
+ def test_json_to_dict_malformed_json():
117
+ json_string = '{"key": "value"'
118
+ expected_result = '{"key": "value"'
119
+ assert json_to_dict(json_string) == expected_result
120
+ assert isinstance(json_to_dict(json_string), str)
121
+
122
+
123
+ def test_json_to_dict_single_quotes():
124
+ json_string = "{'key': 'value'}"
125
+ expected_result = {"key": "value"}
126
+ assert json_to_dict(json_string) == expected_result
127
+ assert isinstance(json_to_dict(json_string), dict)
128
+
129
+
130
+ def test_json_to_dict_path():
131
+ json_string = "/path/to/file.json"
132
+ expected_result = "/path/to/file.json"
133
+ assert json_to_dict(json_string) == expected_result
134
+ assert isinstance(json_to_dict(json_string), str)
135
+
136
+
137
+ def test_ensure_isoformat_datetime_for_datetime():
138
+ dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0))
139
+ assert dt == "2021-01-01T12:00:00"
140
+
141
+
142
+ def test_ensure_isoformat_datetime_for_datetime_with_tz():
143
+ dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC))
144
+ assert dt == "2021-01-01T12:00:00+00:00"
145
+
146
+
147
+ def test_ensure_isoformat_datetime_for_string():
148
+ dt = ensure_isoformat_datetime("2021-01-01T12:00:00")
149
+ assert dt == "2021-01-01T12:00:00"
150
+
151
+
152
+ def test_ensure_isoformat_datetime_for_string2():
153
+ dt = ensure_isoformat_datetime("2021-01-01T12:00:00+00:00")
154
+ assert dt == "2021-01-01T12:00:00+00:00"
155
+
156
+
157
+ def test_ensure_isoformat_datetime_fails_on_string():
158
+ with pytest.raises(ValueError):
159
+ ensure_isoformat_datetime("bad timestamp")
160
+
161
+
162
+ def test_ensure_isoformat_datetime_fails_on_int():
163
+ with pytest.raises(TypeError):
164
+ ensure_isoformat_datetime(1111)
test/unit/test_utils_v2.py +82 -0
@@ -0,0 +1,82 @@
1
+ import json
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, Field, Secret, SecretStr
5
+ from pydantic.types import _SecretBase
6
+
7
+ from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json
8
+
9
+
10
+ class MockChildBaseModel(BaseModel):
11
+ child_secret_str: SecretStr
12
+ child_secret_float: Secret[float]
13
+ child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
14
+
15
+
16
+ class MockBaseModel(BaseModel):
17
+ secret_str: SecretStr
18
+ not_secret_bool: bool
19
+ secret_child_base: Secret[MockChildBaseModel]
20
+ not_secret_list: list[int] = Field(default_factory=list)
21
+
22
+
23
+ model = MockBaseModel(
24
+ secret_str="secret string",
25
+ not_secret_bool=False,
26
+ secret_child_base=MockChildBaseModel(
27
+ child_secret_str="child secret string",
28
+ child_secret_float=3.14,
29
+ child_not_secret_dict={"key": "value"},
30
+ ),
31
+ not_secret_list=[1, 2, 3],
32
+ )
33
+
34
+
35
+ def test_serialize_base_model():
36
+
37
+ serialized_dict = model.model_dump()
38
+ assert isinstance(serialized_dict["secret_str"], _SecretBase)
39
+ assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
40
+
41
+ serialized_dict_w_secrets = serialize_base_model(model=model)
42
+ assert not isinstance(serialized_dict_w_secrets["secret_str"], _SecretBase)
43
+ assert not isinstance(serialized_dict_w_secrets["secret_child_base"], _SecretBase)
44
+
45
+ expected_dict = {
46
+ "secret_str": "secret string",
47
+ "not_secret_bool": False,
48
+ "secret_child_base": {
49
+ "child_secret_str": "child secret string",
50
+ "child_secret_float": 3.14,
51
+ "child_not_secret_dict": {"key": "value"},
52
+ },
53
+ "not_secret_list": [1, 2, 3],
54
+ }
55
+
56
+ assert serialized_dict_w_secrets == expected_dict
57
+
58
+
59
+ def test_serialize_base_model_json():
60
+ serialized_json = model.model_dump_json()
61
+ serialized_dict = json.loads(serialized_json)
62
+ expected_dict = {
63
+ "secret_str": "**********",
64
+ "not_secret_bool": False,
65
+ "secret_child_base": "**********",
66
+ "not_secret_list": [1, 2, 3],
67
+ }
68
+ assert expected_dict == serialized_dict
69
+
70
+ serialized_json_w_secrets = serialize_base_model_json(model=model)
71
+ serialized_dict_w_secrets = json.loads(serialized_json_w_secrets)
72
+ expected_dict_w_secrets = {
73
+ "secret_str": "secret string",
74
+ "not_secret_bool": False,
75
+ "secret_child_base": {
76
+ "child_secret_str": "child secret string",
77
+ "child_secret_float": 3.14,
78
+ "child_not_secret_dict": {"key": "value"},
79
+ },
80
+ "not_secret_list": [1, 2, 3],
81
+ }
82
+ assert expected_dict_w_secrets == serialized_dict_w_secrets
unstructured_ingest/__version__.py +1 -1
@@ -1 +1 @@
1
- __version__ = "0.0.25" # pragma: no cover
1
+ __version__ = "0.1.1" # pragma: no cover
unstructured_ingest/cli/interfaces.py +2 -2
@@ -341,9 +341,9 @@ class CliPartitionConfig(PartitionConfig, CliMixin):
341
341
  ),
342
342
  click.Option(
343
343
  ["--partition-endpoint"],
344
- default="https://api.unstructured.io/general/v0/general",
344
+ default="https://api.unstructuredapp.io/general/v0/general",
345
345
  help="If partitioning via api, use the following host. "
346
- "Default: https://api.unstructured.io/general/v0/general",
346
+ "Default: https://api.unstructuredapp.io/general/v0/general",
347
347
  ),
348
348
  click.Option(
349
349
  ["--api-key"],
unstructured_ingest/connector/notion/types/block.py +1 -0
@@ -58,6 +58,7 @@ class Block(FromJSONMixin, GetHTMLMixin):
58
58
  last_edited_time: str
59
59
  last_edited_by: PartialUser
60
60
  archived: bool
61
+ in_trash: bool
61
62
  has_children: bool
62
63
  parent: Parent
63
64
  block: BlockBase
unstructured_ingest/connector/notion/types/database.py +1 -0
@@ -26,6 +26,7 @@ class Database(FromJSONMixin, GetHTMLMixin):
26
26
  last_edited_time: str
27
27
  last_edited_by: PartialUser
28
28
  archived: bool
29
+ in_trash: bool
29
30
  parent: Parent
30
31
  url: str
31
32
  is_inline: bool
unstructured_ingest/connector/notion/types/page.py +1 -0
@@ -16,6 +16,7 @@ class Page(FromJSONMixin):
16
16
  last_edited_time: str
17
17
  last_edited_by: PartialUser
18
18
  archived: bool
19
+ in_trash: bool
19
20
  properties: dict
20
21
  parent: Parent
21
22
  url: str