unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +102 -91
- test/integration/connectors/sql/test_singlestore.py +111 -99
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +86 -75
- test/integration/connectors/test_astradb.py +22 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +4 -4
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +3 -3
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
- unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +18 -14
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/{validation.py → validation/source.py}
@@ -1,83 +1,28 @@
-import filecmp
 import json
 import os
 import shutil
-from dataclasses import
+from dataclasses import replace
 from pathlib import Path
 from typing import Callable, Optional
 
-import pandas as pd
-from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
+from pydantic import Field
 
-from test.integration.connectors.utils.
+from test.integration.connectors.utils.validation.utils import ValidationConfig, reset_dir
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
-
-    expected_df = pd.read_csv(expected_filepath)
-    current_df = pd.read_csv(current_filepath)
-    if expected_df.equals(current_df):
-        return True
-    # Print diff
-    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
-        lambda x: x["_merge"] != "both"
-    ]
-    print("diff between expected and current df:")
-    print(diff)
-    return False
-
-
-def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_soup = BeautifulSoup(expected_f, "html.parser")
-    with current_filepath.open() as current_f:
-        current_soup = BeautifulSoup(current_f, "html.parser")
-    return expected_soup.text == current_soup.text
-
-
-def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
-    with expected_filepath.open() as expected_f:
-        expected_text_lines = expected_f.readlines()
-    with current_filepath.open() as current_f:
-        current_text_lines = current_f.readlines()
-    if len(expected_text_lines) != len(current_text_lines):
-        print(
-            f"Lines in expected text file ({len(expected_text_lines)}) "
-            f"don't match current text file ({len(current_text_lines)})"
-        )
-        return False
-    expected_text = "\n".join(expected_text_lines)
-    current_text = "\n".join(current_text_lines)
-    if expected_text == current_text:
-        return True
-    print("txt content don't match:")
-    print(f"expected: {expected_text}")
-    print(f"current: {current_text}")
-    return False
-
-
-file_type_equality_check = {
-    ".json": json_equality_check,
-    ".html": html_equality_check,
-    ".txt": txt_equality_check,
-}
-
-
-@dataclass
-class ValidationConfigs:
-    test_id: str
+class SourceValidationConfigs(ValidationConfig):
     expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
-    exclude_fields: list[str] =
+    exclude_fields: list[str] = Field(
        default_factory=lambda: ["local_download_path", "metadata.date_processed"]
    )
-    exclude_fields_extend: list[str] =
+    exclude_fields_extend: list[str] = Field(default_factory=list)
     validate_downloaded_files: bool = False
     validate_file_data: bool = True
-    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -97,9 +42,6 @@ class ValidationConfigs:
         downloaded_files = [p for p in download_dir.rglob("*") if p.is_file()]
         assert len(downloaded_files) == expected_num_files
 
-    def test_output_dir(self) -> Path:
-        return expected_results_path / self.test_id
-
     def omit_ignored_fields(self, data: dict) -> dict:
         exclude_fields = self.get_exclude_fields()
         # Ignore fields that dynamically change every time the tests run
@@ -143,7 +85,7 @@ def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
 
 
 def check_contents(
-    expected_output_dir: Path, all_file_data: list[FileData], configs:
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     found_diff = False
     for file_data in all_file_data:
@@ -160,27 +102,10 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
-def detect_diff(
-    configs: ValidationConfigs, expected_filepath: Path, current_filepath: Path
-) -> bool:
-    if expected_filepath.suffix != current_filepath.suffix:
-        return True
-    if downloaded_file_equality_check := configs.downloaded_file_equality_check:
-        return not downloaded_file_equality_check(expected_filepath, current_filepath)
-    current_suffix = expected_filepath.suffix
-    if current_suffix in file_type_equality_check:
-        equality_check_callable = file_type_equality_check[current_suffix]
-        return not equality_check_callable(
-            expected_filepath=expected_filepath, current_filepath=current_filepath
-        )
-    # Fallback is using filecmp.cmp to compare the files
-    return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
-
-
 def check_raw_file_contents(
     expected_output_dir: Path,
     current_output_dir: Path,
-    configs:
+    configs: SourceValidationConfigs,
 ):
     current_files = get_files(dir_path=current_output_dir)
     found_diff = False
@@ -188,7 +113,7 @@ def check_raw_file_contents(
     for current_file in current_files:
         current_file_path = current_output_dir / current_file
         expected_file_path = expected_output_dir / current_file
-        if detect_diff(
+        if configs.detect_diff(expected_file_path, current_file_path):
             found_diff = True
             files.append(str(expected_file_path))
             print(f"diffs between files {expected_file_path} and {current_file_path}")
@@ -196,7 +121,7 @@ def check_raw_file_contents(
 
 
 def run_expected_results_validation(
-    expected_output_dir: Path, all_file_data: list[FileData], configs:
+    expected_output_dir: Path, all_file_data: list[FileData], configs: SourceValidationConfigs
 ):
     check_files(expected_output_dir=expected_output_dir, all_file_data=all_file_data)
     check_contents(
@@ -207,7 +132,7 @@ def run_expected_results_validation(
 def run_expected_download_files_validation(
     expected_output_dir: Path,
     current_download_dir: Path,
-    configs:
+    configs: SourceValidationConfigs,
 ):
     check_files_in_paths(
         expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
@@ -234,12 +159,10 @@ def update_fixtures(
     save_downloads: bool = False,
     save_filedata: bool = True,
 ):
-    # Delete current files
-    shutil.rmtree(path=output_dir, ignore_errors=True)
-    output_dir.mkdir(parents=True)
     # Rewrite the current file data
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
+        reset_dir(dir_path=file_data_output_path)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -260,6 +183,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
+        reset_dir(raw_download_output_path)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -268,7 +192,7 @@ def update_fixtures(
 
 
 def run_all_validations(
-    configs:
+    configs: SourceValidationConfigs,
     predownload_file_data: list[FileData],
     postdownload_file_data: list[FileData],
     download_dir: Path,
@@ -308,7 +232,7 @@ def run_all_validations(
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
-    configs:
+    configs: SourceValidationConfigs,
     overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true",
 ) -> None:
     # Run common validations on the process of running a source connector, supporting dynamic
test/integration/connectors/utils/validation/utils.py (new file)
@@ -0,0 +1,36 @@
+import filecmp
+import shutil
+from pathlib import Path
+from typing import Callable, Optional
+
+from pydantic import BaseModel
+
+from test.integration.connectors.utils.constants import expected_results_path
+from test.integration.connectors.utils.validation.equality import file_type_equality_check
+
+
+class ValidationConfig(BaseModel):
+    test_id: str
+    file_equality_check: Optional[Callable[[Path, Path], bool]] = None
+
+    def test_output_dir(self) -> Path:
+        return expected_results_path / self.test_id
+
+    def detect_diff(self, expected_filepath: Path, current_filepath: Path) -> bool:
+        if expected_filepath.suffix != current_filepath.suffix:
+            return True
+        if file_equality_check := self.file_equality_check:
+            return not file_equality_check(expected_filepath, current_filepath)
+        current_suffix = expected_filepath.suffix
+        if current_suffix in file_type_equality_check:
+            equality_check_callable = file_type_equality_check[current_suffix]
+            return not equality_check_callable(
+                expected_filepath=expected_filepath, current_filepath=current_filepath
+            )
+        # Fallback is using filecmp.cmp to compare the files
+        return not filecmp.cmp(expected_filepath, current_filepath, shallow=False)
+
+
+def reset_dir(dir_path: Path) -> None:
+    shutil.rmtree(path=dir_path, ignore_errors=True)
+    dir_path.mkdir(parents=True)
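The shared ValidationConfig base now owns fixture-diff detection: detect_diff first tries an explicitly supplied file_equality_check, then the per-extension checks from validation.equality, and finally falls back to filecmp. A minimal sketch of plugging in a custom comparator; the size-based check and the paths below are illustrative only, not part of the test suite:

# Hypothetical use of ValidationConfig with a custom file_equality_check.
from pathlib import Path

from test.integration.connectors.utils.validation.utils import ValidationConfig


def same_size(expected: Path, current: Path) -> bool:
    # Example heuristic only: treat files as equal when their byte sizes match
    return expected.stat().st_size == current.stat().st_size


config = ValidationConfig(test_id="example-connector", file_equality_check=same_size)
# True means a diff was detected between the fixture and the freshly produced file
has_diff = config.detect_diff(Path("expected/output.bin"), Path("current/output.bin"))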
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.
+__version__ = "0.3.9" # pragma: no cover
unstructured_ingest/utils/chunking.py
@@ -1,4 +1,7 @@
+import base64
 import hashlib
+import json
+import zlib
 from itertools import groupby
 
 
@@ -43,3 +46,11 @@ def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
             e["metadata"]["parent_id"] = old_to_new_mapping[parent_id]
 
     return elements
+
+
+def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
+    decoded_b64_bytes = base64.b64decode(raw_s)
+    elements_json_bytes = zlib.decompress(decoded_b64_bytes)
+    elements_json_str = elements_json_bytes.decode("utf-8")
+    element_dicts = json.loads(elements_json_str)
+    return element_dicts
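The new elements_from_base64_gzipped_json helper base64-decodes a string, decompresses it with zlib, and parses the JSON payload back into element dicts. A minimal round-trip sketch; the sample elements are made up:

# Round-trip sketch for elements_from_base64_gzipped_json (sample data is illustrative).
import base64
import json
import zlib

from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json

elements = [{"type": "Title", "text": "Hello"}, {"type": "NarrativeText", "text": "World"}]
encoded = base64.b64encode(zlib.compress(json.dumps(elements).encode("utf-8"))).decode("utf-8")
assert elements_from_base64_gzipped_json(encoded) == elements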
unstructured_ingest/utils/data_prep.py
@@ -1,8 +1,10 @@
 import itertools
 import json
 from datetime import datetime
+from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 
+import ndjson
 import pandas as pd
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -131,3 +133,37 @@ def validate_date_args(date: Optional[str] = None) -> bool:
         f"The argument {date} does not satisfy the format:"
         f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
     )
+
+
+def get_data(path: Path) -> list[dict]:
+    with path.open() as f:
+        if path.suffix == ".json":
+            return json.load(f)
+        elif path.suffix == ".ndjson":
+            return ndjson.load(f)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df.to_dict(orient="records")
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df.to_dict(orient="records")
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
+
+
+def get_data_df(path: Path) -> pd.DataFrame:
+    with path.open() as f:
+        if path.suffix == ".json":
+            data = json.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".ndjson":
+            data = ndjson.load(f)
+            return pd.DataFrame(data=data)
+        elif path.suffix == ".csv":
+            df = pd.read_csv(path)
+            return df
+        elif path.suffix == ".parquet":
+            df = pd.read_parquet(path)
+            return df
+        else:
+            raise ValueError(f"Unsupported file type: {path}")
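The new get_data and get_data_df helpers choose a parser from the file extension (.json, .ndjson, .csv, .parquet) and raise on anything else. A short usage sketch; the file paths are placeholders:

# Usage sketch for the new data_prep loaders; the paths are hypothetical.
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data, get_data_df

records = get_data(path=Path("staged/elements.ndjson"))  # parsed into list[dict]
frame = get_data_df(path=Path("staged/elements.csv"))    # parsed into a pandas DataFrame
print(len(records), frame.shape)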
unstructured_ingest/v2/interfaces/upload_stager.py
@@ -1,8 +1,10 @@
-
+import json
+from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, TypeVar
 
+import ndjson
 from pydantic import BaseModel
 
 from unstructured_ingest.v2.interfaces.file_data import FileData
@@ -20,16 +22,78 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT
 
-
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
+        if output_path.suffix == ".json":
+            with output_path.open("w") as f:
+                json.dump(data, f, indent=2)
+        elif output_path.suffix == ".ndjson":
+            with output_path.open("w") as f:
+                ndjson.dump(data, f)
+        else:
+            raise ValueError(f"Unsupported output format: {output_path}")
+
+    def get_data(self, elements_filepath: Path) -> list[dict]:
+        if elements_filepath.suffix == ".json":
+            with elements_filepath.open() as f:
+                return json.load(f)
+        elif elements_filepath.suffix == ".ndjson":
+            with elements_filepath.open() as f:
+                return ndjson.load(f)
+        else:
+            raise ValueError(f"Unsupported input format: {elements_filepath}")
+
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        return element_dict
+
+    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
+        output_path = Path(output_filename)
+        output_filename = f"{Path(output_filename).stem}{output_path.suffix}"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        return output_path
+
+    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        with input_file.open() as in_f:
+            reader = ndjson.reader(in_f)
+            with output_file.open("w") as out_f:
+                writer = ndjson.writer(out_f)
+                for element in reader:
+                    conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
+                    writer.writerow(row=conformed_element)
+                    writer.f.flush()
+
+    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        with input_file.open() as in_f:
+            elements_contents = json.load(in_f)
+
+        conformed_elements = [
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
+        ]
+
+        with open(output_file, "w") as out_f:
+            json.dump(conformed_elements, out_f, indent=2)
+
     def run(
         self,
         elements_filepath: Path,
         file_data: FileData,
         output_dir: Path,
         output_filename: str,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> Path:
-
+        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+        if elements_filepath.suffix == ".ndjson":
+            self.stream_update(
+                input_file=elements_filepath, output_file=output_file, file_data=file_data
+            )
+        elif elements_filepath.suffix == ".json":
+            self.process_whole(
+                input_file=elements_filepath, output_file=output_file, file_data=file_data
+            )
+        else:
+            raise ValueError(f"Unsupported file extension: {elements_filepath}")
+        return output_file
 
     async def run_async(
         self,
@@ -37,12 +101,12 @@ class UploadStager(BaseProcess, ABC):
         file_data: FileData,
         output_dir: Path,
         output_filename: str,
-        **kwargs: Any
+        **kwargs: Any,
     ) -> Path:
         return self.run(
             elements_filepath=elements_filepath,
             output_dir=output_dir,
             output_filename=output_filename,
             file_data=file_data,
-            **kwargs
+            **kwargs,
         )
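With the staging plumbing moved into the UploadStager base class, a concrete stager only has to override conform_dict; the inherited run() dispatches on the input suffix, streaming .ndjson files element by element and rewriting .json files in one pass, and keeps the input extension on the staged output path. A minimal sketch of a subclass; AddRecordIdStager and its behavior are illustrative, not part of the package:

# Hypothetical stager relying on the new UploadStager base-class behavior.
from dataclasses import dataclass, field

from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig


@dataclass
class AddRecordIdStager(UploadStager):
    upload_stager_config: UploadStagerConfig = field(default_factory=UploadStagerConfig)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        # Tag each element with the identifier of the file it came from
        conformed = element_dict.copy()
        conformed["record_id"] = file_data.identifier
        return conformed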
unstructured_ingest/v2/interfaces/uploader.py
@@ -5,6 +5,7 @@ from typing import Any, TypeVar
 
 from pydantic import BaseModel
 
+from unstructured_ingest.utils.data_prep import get_data
 from unstructured_ingest.v2.interfaces.connector import BaseConnector
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
@@ -38,7 +39,15 @@ class Uploader(BaseProcess, BaseConnector, ABC):
         raise NotImplementedError()
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        data = get_data(path=path)
+        self.run_data(data=data, file_data=file_data, **kwargs)
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
+        data = get_data(path=path)
+        await self.run_data_async(data=data, file_data=file_data, **kwargs)
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        raise NotImplementedError()
+
+    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        return self.run_data(data=data, file_data=file_data, **kwargs)
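Destination connectors can now implement run_data against already-parsed element dicts; the base run() and run_async() read the staged file with get_data before delegating. A minimal sketch; PrintUploader is illustrative only and omits the connection and upload config wiring a real connector needs:

# Hypothetical uploader built on the new run_data hook.
from dataclasses import dataclass
from typing import Any

from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.uploader import Uploader


@dataclass
class PrintUploader(Uploader):
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        # data has already been parsed from the staged .json/.ndjson/.csv/.parquet file
        print(f"would upload {len(data)} elements for record {file_data.identifier}")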
unstructured_ingest/v2/pipeline/steps/stage.py
@@ -39,11 +39,13 @@ class UploadStageStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str
     ) -> UploadStageStepResponse:
         path = Path(path)
+        # Maintain extension
+        output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
         fn_kwargs = {
             "elements_filepath": path,
             "file_data": FileData.from_file(path=file_data_path),
             "output_dir": self.cache_dir,
-            "output_filename":
+            "output_filename": output_filename,
         }
         if not asyncio.iscoroutinefunction(fn):
             staged_output_path = fn(**fn_kwargs)
unstructured_ingest/v2/processes/connectors/astradb.py
@@ -1,7 +1,6 @@
 import copy
 import csv
 import hashlib
-import json
 import sys
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -17,7 +16,7 @@ from unstructured_ingest.error import (
     SourceConnectionError,
     SourceConnectionNetworkError,
 )
-from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.data_prep import batch_generator, get_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
@@ -325,29 +324,6 @@ class AstraDBUploadStager(UploadStager):
             "metadata": element_dict,
         }
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = []
-        for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
-        output_filename_path = Path(output_filename)
-        if output_filename_path.suffix == ".json":
-            output_path = Path(output_dir) / output_filename_path
-        else:
-            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-
 
 @dataclass
 class AstraDBUploader(Uploader):
@@ -386,11 +362,9 @@ class AstraDBUploader(Uploader):
             f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(
+            f"writing {len(data)} objects to destination "
             f"collection {self.upload_config.collection_name}"
         )
 
@@ -399,9 +373,13 @@ class AstraDBUploader(Uploader):
 
         self.delete_by_record_id(collection=collection, file_data=file_data)
 
-        for chunk in batch_generator(
+        for chunk in batch_generator(data, astra_db_batch_size):
             collection.insert_many(chunk)
 
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        data = get_data(path=path)
+        self.run_data(data=data, file_data=file_data, **kwargs)
+
 
 astra_db_source_entry = SourceRegistryEntry(
     indexer=AstraDBIndexer,
unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -1,7 +1,7 @@
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator
 
 from pydantic import Field, Secret
 
@@ -49,29 +49,33 @@ class AzureAISearchConnectionConfig(ConnectionConfig):
     access_config: Secret[AzureAISearchAccessConfig]
 
     @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-
+    @contextmanager
+    def get_search_client(self) -> Generator["SearchClient", None, None]:
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient
 
-
+        with SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
                 self.access_config.get_secret_value().azure_ai_search_key
             ),
-        )
+        ) as client:
+            yield client
 
     @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-
+    @contextmanager
+    def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents.indexes import SearchIndexClient
 
-
+        with SearchIndexClient(
             endpoint=self.endpoint,
             credential=AzureKeyCredential(
                 self.access_config.get_secret_value().azure_ai_search_key
             ),
-        )
+        ) as search_index_client:
+            yield search_index_client
 
 
 class AzureAISearchUploadStagerConfig(UploadStagerConfig):
@@ -92,14 +96,13 @@ class AzureAISearchUploadStager(UploadStager):
         default_factory=lambda: AzureAISearchUploadStagerConfig()
     )
 
-
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
         updates the dictionary that is from each Element being converted into a dict/json
         into a dictionary that conforms to the schema expected by the
         Azure Cognitive Search index
         """
-
+        data = element_dict.copy()
         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
         data[RECORD_ID_LABEL] = file_data.identifier
 
@@ -140,31 +143,6 @@ class AzureAISearchUploadStager(UploadStager):
             data["metadata"]["page_number"] = str(page_number)
         return data
 
-    def run(
-        self,
-        file_data: FileData,
-        elements_filepath: Path,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-
-        if Path(output_filename).suffix != ".json":
-            output_filename = f"{output_filename}.json"
-        else:
-            output_filename = f"{Path(output_filename).stem}.json"
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-
 
 @dataclass
 class AzureAISearchUploader(Uploader):
@@ -270,9 +248,7 @@ class AzureAISearchUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
             f"writing document batches to destination"
             f" endpoint at {str(self.connection_config.endpoint)}"
@@ -287,7 +263,7 @@ class AzureAISearchUploader(Uploader):
 
         batch_size = self.upload_config.batch_size
         with self.connection_config.get_search_client() as search_client:
-            for chunk in batch_generator(
+            for chunk in batch_generator(data, batch_size):
                 self.write_dict(elements_dict=chunk, search_client=search_client) # noqa: E203
 
 
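The Azure AI Search client accessors are now generator-based context managers, so callers acquire and release the underlying SDK clients with a with-block, as the uploader hunk above already does. A usage sketch; the endpoint, index, and key values are placeholders and the constructor field names are inferred from the config shown in this diff:

# Sketch of the context-manager client access; connection values are placeholders.
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
)

connection_config = AzureAISearchConnectionConfig(
    endpoint="https://example.search.windows.net",
    index="example-index",
    access_config=AzureAISearchAccessConfig(azure_ai_search_key="<key>"),
)

with connection_config.get_search_client() as search_client:
    # The SearchClient is closed automatically when the block exits
    print(search_client.get_document_count())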