unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of unstructured-ingest has been flagged as a potentially problematic release.
- test/integration/connectors/sql/test_postgres.py +3 -3
- test/integration/connectors/sql/test_singlestore.py +3 -3
- test/integration/connectors/sql/test_sqlite.py +3 -3
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
test/integration/connectors/sql/test_postgres.py (+3 -3)

@@ -28,7 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.postgres import (
     PostgresUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -69,7 +69,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
     )
     indexer = PostgresIndexer(
         connection_config=connection_config,
-        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=PostgresIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = PostgresDownloader(
         connection_config=connection_config,
@@ -81,7 +81,7 @@ async def test_postgres_source(temp_dir: Path, source_database_setup: str):
         configs=SourceValidationConfigs(
             test_id="postgres",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )

test/integration/connectors/sql/test_singlestore.py (+3 -3)

@@ -29,7 +29,7 @@ from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
     SingleStoreUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -66,7 +66,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
     )
     indexer = SingleStoreIndexer(
         connection_config=connection_config,
-        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=SingleStoreIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SingleStoreDownloader(
         connection_config=connection_config,
@@ -80,7 +80,7 @@ async def test_singlestore_source(temp_dir: Path, source_database_setup: dict):
         configs=SourceValidationConfigs(
             test_id="singlestore",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )

test/integration/connectors/sql/test_sqlite.py (+3 -3)

@@ -27,7 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
     SQLiteUploadStager,
 )
 
-SEED_DATA_ROWS =
+SEED_DATA_ROWS = 10
 
 
 @pytest.fixture
@@ -57,7 +57,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
     connection_config = SQLiteConnectionConfig(database_path=source_database_setup)
     indexer = SQLiteIndexer(
         connection_config=connection_config,
-        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=
+        index_config=SQLiteIndexerConfig(table_name="cars", id_column="car_id", batch_size=6),
     )
     downloader = SQLiteDownloader(
         connection_config=connection_config,
@@ -69,7 +69,7 @@ async def test_sqlite_source(source_database_setup: Path, temp_dir: Path):
         configs=SourceValidationConfigs(
             test_id="sqlite",
             expected_num_files=SEED_DATA_ROWS,
-            expected_number_indexed_file_data=
+            expected_number_indexed_file_data=2,
             validate_downloaded_files=True,
         ),
     )

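All three SQL source tests now pin the same batching arithmetic: 10 seeded rows indexed with batch_size=6 still yield 10 downloaded files, while the number of indexed file data records drops to 2, which is consistent with one record per batch. A minimal sketch of that assumption (the values come from the hunks above; the one-record-per-batch reading is ours):

import math

SEED_DATA_ROWS = 10  # rows seeded into the "cars" table
BATCH_SIZE = 6       # batch_size passed to the indexer config

# assuming each batch of row ids produces one indexed (batch-level) file data record
expected_number_indexed_file_data = math.ceil(SEED_DATA_ROWS / BATCH_SIZE)
expected_num_files = SEED_DATA_ROWS  # one downloaded file per row
assert (expected_number_indexed_file_data, expected_num_files) == (2, 10)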
test/integration/connectors/test_astradb.py (+40 -0)

@@ -14,12 +14,18 @@ from test.integration.connectors.utils.validation.destination import (
     StagerValidationConfigs,
     stager_validation,
 )
+from test.integration.connectors.utils.validation.source import (
+    SourceValidationConfigs,
+    source_connector_validation,
+)
 from test.integration.utils import requires_env
 from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.astradb import (
     CONNECTOR_TYPE,
     AstraDBAccessConfig,
     AstraDBConnectionConfig,
+    AstraDBDownloader,
+    AstraDBDownloaderConfig,
     AstraDBIndexer,
     AstraDBIndexerConfig,
     AstraDBUploader,
@@ -110,6 +116,40 @@ def collection(upload_file: Path) -> Collection:
     astra_db.drop_collection(collection)
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
+async def test_astra_search_source(
+    tmp_path: Path,
+):
+    env_data = get_env_data()
+    collection_name = "ingest_test_src"
+    connection_config = AstraDBConnectionConfig(
+        access_config=AstraDBAccessConfig(token=env_data.token, api_endpoint=env_data.api_endpoint)
+    )
+    indexer = AstraDBIndexer(
+        index_config=AstraDBIndexerConfig(
+            collection_name=collection_name,
+        ),
+        connection_config=connection_config,
+    )
+    downloader = AstraDBDownloader(
+        connection_config=connection_config,
+        download_config=AstraDBDownloaderConfig(download_dir=tmp_path),
+    )
+
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=5,
+            expected_number_indexed_file_data=1,
+            validate_downloaded_files=True,
+        ),
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
 @requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")

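The new Astra DB source test reads its credentials through get_env_data() and is gated by requires_env. That helper is not part of this diff; a hypothetical sketch of what it presumably does (the class and function shapes here are assumptions, only the environment variable names come from the test above):

import os
from dataclasses import dataclass


@dataclass
class AstraEnvData:  # hypothetical stand-in for whatever get_env_data() actually returns
    token: str
    api_endpoint: str


def get_env_data() -> AstraEnvData:
    # requires_env("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN") guards these lookups
    return AstraEnvData(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    )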
test/integration/connectors/test_kafka.py (+2 -2)

@@ -122,7 +122,7 @@ async def test_kafka_source_local(kafka_seed_topic: str):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka", expected_num_files=5, validate_downloaded_files=True
+            test_id="kafka-local", expected_num_files=5, validate_downloaded_files=True
         ),
     )
 
@@ -204,7 +204,7 @@ async def test_kafka_source_cloud(kafka_seed_topic_cloud: int):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="kafka",
+            test_id="kafka-cloud",
             exclude_fields_extend=["connector_type"],
             expected_num_files=expected_messages,
             validate_downloaded_files=True,

test/integration/connectors/test_mongodb.py (+4 -1)

@@ -197,7 +197,10 @@ async def test_mongodb_source(temp_dir: Path):
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id=CONNECTOR_TYPE,
+            test_id=CONNECTOR_TYPE,
+            expected_num_files=4,
+            validate_downloaded_files=True,
+            expected_number_indexed_file_data=1,
         ),
     )
 

test/integration/connectors/utils/validation/source.py (+31 -11)

@@ -1,14 +1,13 @@
 import json
 import os
 import shutil
-from dataclasses import replace
 from pathlib import Path
 from typing import Callable, Optional
 
 from deepdiff import DeepDiff
 from pydantic import Field
 
-from test.integration.connectors.utils.validation.utils import ValidationConfig
+from test.integration.connectors.utils.validation.utils import ValidationConfig
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
@@ -92,7 +91,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
         expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.
+    current_file_data_contents = file_data.model_dump()
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -160,9 +159,11 @@ def update_fixtures(
     save_filedata: bool = True,
 ):
     # Rewrite the current file data
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True)
     if save_filedata:
         file_data_output_path = output_dir / "file_data"
-
+        shutil.rmtree(path=file_data_output_path, ignore_errors=True)
         print(
             f"Writing {len(all_file_data)} file data to "
             f"saved fixture location {file_data_output_path}"
@@ -171,7 +172,7 @@ def update_fixtures(
         for file_data in all_file_data:
             file_data_path = file_data_output_path / f"{file_data.identifier}.json"
             with file_data_path.open(mode="w") as f:
-                json.dump(file_data.
+                json.dump(file_data.model_dump(), f, indent=2)
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)
@@ -183,7 +184,7 @@ def update_fixtures(
     # If applicable, save raw downloads
     if save_downloads:
         raw_download_output_path = output_dir / "downloads"
-
+        shutil.rmtree(path=raw_download_output_path, ignore_errors=True)
         print(
             f"Writing {len(download_files)} downloaded files to "
             f"saved fixture location {raw_download_output_path}"
@@ -213,7 +214,10 @@ def run_all_validations(
     if configs.validate_file_data:
         run_expected_results_validation(
             expected_output_dir=test_output_dir / "file_data",
-            all_file_data=
+            all_file_data=get_all_file_data(
+                all_predownload_file_data=predownload_file_data,
+                all_postdownload_file_data=postdownload_file_data,
+            ),
             configs=configs,
         )
     download_files = get_files(dir_path=download_dir)
@@ -229,6 +233,19 @@ def run_all_validations(
     )
 
 
+def get_all_file_data(
+    all_postdownload_file_data: list[FileData], all_predownload_file_data: list[FileData]
+) -> list[FileData]:
+    all_file_data = all_postdownload_file_data
+    indexed_file_data = [
+        fd
+        for fd in all_predownload_file_data
+        if fd.identifier not in [f.identifier for f in all_file_data]
+    ]
+    all_file_data += indexed_file_data
+    return all_file_data
+
+
 async def source_connector_validation(
     indexer: Indexer,
     downloader: Downloader,
@@ -246,7 +263,7 @@ async def source_connector_validation(
     test_output_dir = configs.test_output_dir()
     for file_data in indexer.run():
         assert file_data
-        predownload_file_data =
+        predownload_file_data = file_data.model_copy(deep=True)
         all_predownload_file_data.append(predownload_file_data)
         if downloader.is_async():
             resp = await downloader.run_async(file_data=file_data)
@@ -254,10 +271,10 @@ async def source_connector_validation(
             resp = downloader.run(file_data=file_data)
         if isinstance(resp, list):
             for r in resp:
-                postdownload_file_data =
+                postdownload_file_data = r["file_data"].model_copy(deep=True)
                 all_postdownload_file_data.append(postdownload_file_data)
         else:
-            postdownload_file_data =
+            postdownload_file_data = resp["file_data"].model_copy(deep=True)
             all_postdownload_file_data.append(postdownload_file_data)
     if not overwrite_fixtures:
         print("Running validation")
@@ -273,7 +290,10 @@ async def source_connector_validation(
     update_fixtures(
         output_dir=test_output_dir,
         download_dir=download_dir,
-        all_file_data=
+        all_file_data=get_all_file_data(
+            all_predownload_file_data=all_predownload_file_data,
+            all_postdownload_file_data=all_postdownload_file_data,
+        ),
        save_downloads=configs.validate_downloaded_files,
        save_filedata=configs.validate_file_data,
    )

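The new get_all_file_data helper merges the post-download records with any indexed (pre-download) records whose identifiers never reappeared after download, so batch-level file data also lands in the fixtures and counts toward expected_number_indexed_file_data. A small usage sketch that mirrors the helper above, using a simplified stand-in rather than the real FileData model:

from dataclasses import dataclass


@dataclass
class StubFileData:  # stand-in for unstructured_ingest's FileData in this sketch
    identifier: str


def get_all_file_data(all_postdownload_file_data, all_predownload_file_data):
    # same merge as the diff: keep post-download records, then append any
    # indexed records whose identifier was not seen after download
    all_file_data = list(all_postdownload_file_data)
    seen = {fd.identifier for fd in all_file_data}
    all_file_data += [fd for fd in all_predownload_file_data if fd.identifier not in seen]
    return all_file_data


pre = [StubFileData("batch-1")]                         # e.g. one indexed batch record
post = [StubFileData("doc-a"), StubFileData("doc-b")]   # per-document records after download
merged = get_all_file_data(post, pre)
assert [fd.identifier for fd in merged] == ["doc-a", "doc-b", "batch-1"]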
unstructured_ingest/__version__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "0.3.
+__version__ = "0.3.10"  # pragma: no cover

unstructured_ingest/v2/interfaces/__init__.py (+3 -1)

@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
+from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -27,4 +27,6 @@ __all__ = [
     "ConnectionConfig",
     "BaseConnector",
     "FileDataSourceMetadata",
+    "BatchFileData",
+    "BatchItem",
 ]

unstructured_ingest/v2/interfaces/file_data.py (+58 -14)

@@ -1,13 +1,14 @@
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any,
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
 
-from
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 
+from unstructured_ingest.v2.logger import logger
 
-
-class SourceIdentifiers:
+
+class SourceIdentifiers(BaseModel):
     filename: str
     fullpath: str
     rel_path: Optional[str] = None
@@ -21,8 +22,7 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath
 
 
-
-class FileDataSourceMetadata(DataClassJsonMixin):
+class FileDataSourceMetadata(BaseModel):
     url: Optional[str] = None
     version: Optional[str] = None
     record_locator: Optional[dict[str, Any]] = None
@@ -33,14 +33,12 @@ class FileDataSourceMetadata(DataClassJsonMixin):
     filesize_bytes: Optional[int] = None
 
 
-
-class FileData(DataClassJsonMixin):
+class FileData(BaseModel):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
-
-
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
     display_name: Optional[str] = None
@@ -52,11 +50,57 @@ class FileData(DataClassJsonMixin):
             raise ValueError(f"file path not valid: {path}")
         with open(str(path.resolve()), "rb") as f:
             file_data_dict = json.load(f)
-        file_data =
+        file_data = cls.model_validate(file_data_dict)
         return file_data
 
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
     def to_file(self, path: str) -> None:
         path = Path(path).resolve()
         path.parent.mkdir(parents=True, exist_ok=True)
         with open(str(path.resolve()), "w") as f:
-            json.dump(self.
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not valid for batch file data")
+
+    return FileData.from_file(path=path)

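The behavioral core of the new models, as shown in the hunks above: FileData is now a pydantic BaseModel, BatchFileData derives its identifier deterministically from the batch item identifiers and versions via uuid5, and the field validator sorts and de-duplicates batch items. A minimal sketch of that behavior under those assumptions (field values are illustrative):

from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem

batch = BatchFileData(
    connector_type="postgres",
    batch_items=[BatchItem(identifier="3"), BatchItem(identifier="1"), BatchItem(identifier="2")],
)
same_batch = BatchFileData(
    connector_type="postgres",
    batch_items=[BatchItem(identifier=i) for i in ("1", "2", "3")],
)

# the identifier is populated by the model validator and should be stable
# for the same set of (identifier, version) pairs, regardless of input order
assert batch.identifier == same_batch.identifier
# the field validator sorts batch items by identifier
assert [item.identifier for item in batch.batch_items] == ["1", "2", "3"]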
unstructured_ingest/v2/pipeline/steps/chunk.py (+2 -1)

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.chunker import Chunker
@@ -51,7 +52,7 @@ class ChunkStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str, **kwargs
     ) -> ChunkStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping chunking, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/download.py (+5 -4)

@@ -8,6 +8,7 @@ from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.utils import serialize_base_model_json
@@ -87,12 +88,12 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.
+            logger.debug(f"updating file data with new content: {file_data.model_dump()}")
             with file_data_path.open("w") as file:
-                json.dump(file_data.
+                json.dump(file_data.model_dump(), file, indent=2)
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"skipping download, file already exists locally: {download_path}")
@@ -172,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-            json.dump(file_data.
+            json.dump(file_data.model_dump(), f, indent=2)
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:

unstructured_ingest/v2/pipeline/steps/embed.py (+2 -1)

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.embedder import Embedder
@@ -49,7 +50,7 @@ class EmbedStep(PipelineStep):
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping embedding, output already exists: {output_filepath}")

unstructured_ingest/v2/pipeline/steps/filter.py (+2 -2)

@@ -2,7 +2,7 @@ import asyncio
 from dataclasses import dataclass
 from typing import Callable, Optional
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.filter import Filterer
@@ -20,7 +20,7 @@ class FilterStep(PipelineStep):
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             resp = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/index.py (+4 -4)

@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
 
     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.
+            logger.debug(f"generated file data: {file_data.model_dump()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.
+                    json.dump(file_data.model_dump(), f, indent=2)
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)

unstructured_ingest/v2/pipeline/steps/partition.py (+3 -2)

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
 from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.partitioner import Partitioner
@@ -51,12 +52,12 @@ class PartitionStep(PipelineStep):
         self, fn: Callable, path: str, file_data_path: str
     ) -> Optional[PartitionStepResponse]:
         path = Path(path)
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata.
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.model_dump()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:

unstructured_ingest/v2/pipeline/steps/stage.py (+2 -2)

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
@@ -43,7 +43,7 @@ class UploadStageStep(PipelineStep):
         output_filename = f"{self.get_hash(extras=[path.name])}{path.suffix}"
         fn_kwargs = {
             "elements_filepath": path,
-            "file_data":
+            "file_data": file_data_from_file(path=file_data_path),
             "output_dir": self.cache_dir,
             "output_filename": output_filename,
         }

unstructured_ingest/v2/pipeline/steps/uncompress.py (+2 -2)

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, TypedDict
 
-from unstructured_ingest.v2.interfaces.file_data import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
@@ -28,7 +28,7 @@ class UncompressStep(PipelineStep):
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
     ) -> list[UncompressStepResponse]:
-        file_data =
+        file_data = file_data_from_file(path=file_data_path)
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
             new_file_data = fn(**fn_kwargs)

unstructured_ingest/v2/pipeline/steps/upload.py (+3 -3)

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.interfaces.uploader import UploadContent
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import BatchPipelineStep
@@ -41,14 +41,14 @@ class UploadStep(BatchPipelineStep):
     @instrument(span_name=STEP_ID)
     def _run_batch(self, contents: list[UploadStepContent]) -> None:
         upload_contents = [
-            UploadContent(path=Path(c["path"]), file_data=
+            UploadContent(path=Path(c["path"]), file_data=file_data_from_file(c["file_data_path"]))
             for c in contents
         ]
         self.process.run_batch(contents=upload_contents)
 
     async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None):
         fn = fn or self.process.run_async
-        fn_kwargs = {"path": Path(path), "file_data":
+        fn_kwargs = {"path": Path(path), "file_data": file_data_from_file(path=file_data_path)}
         if not asyncio.iscoroutinefunction(fn):
             fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:

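Every pipeline step now rehydrates its serialized file data through file_data_from_file, which, per the file_data.py hunks, returns a BatchFileData when the JSON contains batch_items and falls back to plain FileData otherwise. A minimal round-trip sketch under that assumption (paths and field values are illustrative):

import tempfile
from pathlib import Path

from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem, FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file

with tempfile.TemporaryDirectory() as tmp:
    plain_path = str(Path(tmp) / "plain.json")
    batch_path = str(Path(tmp) / "batch.json")

    # serialize one plain record and one batch record the way the steps do
    FileData(identifier="doc-a", connector_type="local").to_file(plain_path)
    BatchFileData(
        connector_type="postgres",
        batch_items=[BatchItem(identifier="1"), BatchItem(identifier="2")],
    ).to_file(batch_path)

    # rehydration picks the right model based on the stored content
    assert isinstance(file_data_from_file(path=batch_path), BatchFileData)
    assert isinstance(file_data_from_file(path=plain_path), FileData)
    assert not isinstance(file_data_from_file(path=plain_path), BatchFileData)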
unstructured_ingest/v2/processes/connectors/__init__.py (+3 -0)

@@ -40,6 +40,8 @@ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
+from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE
+from .neo4j import neo4j_destination_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -74,6 +76,7 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
 add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
+add_destination_entry(destination_type=NEO4J_CONNECTOR_TYPE, entry=neo4j_destination_entry)
 
 add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
 