unstructured-ingest 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/conftest.py +13 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
- test/integration/connectors/sql/test_postgres.py +6 -10
- test/integration/connectors/sql/test_snowflake.py +205 -0
- test/integration/connectors/sql/test_sqlite.py +6 -10
- test/integration/connectors/test_delta_table.py +138 -0
- test/integration/connectors/utils/docker.py +78 -0
- test/integration/connectors/utils/validation.py +93 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +32 -1
- unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
- unstructured_ingest/v2/interfaces/indexer.py +4 -1
- unstructured_ingest/v2/pipeline/pipeline.py +10 -2
- unstructured_ingest/v2/pipeline/steps/index.py +18 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +10 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +1 -1
- unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +10 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +33 -37
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +164 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +38 -10
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +31 -32
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA +14 -12
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD +29 -24
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.0.dist-info}/top_level.txt +0 -0

test/integration/connectors/utils/validation.py (+93 -2)

@@ -1,3 +1,4 @@
+import filecmp
 import json
 import os
 import shutil
@@ -5,15 +6,31 @@ from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import Callable, Optional
 
+import pandas as pd
 from deepdiff import DeepDiff
 
 from test.integration.connectors.utils.constants import expected_results_path
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
+def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    expected_df = pd.read_csv(expected_filepath)
+    current_df = pd.read_csv(current_filepath)
+    if expected_df.equals(current_df):
+        return True
+    # Print diff
+    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
+        lambda x: x["_merge"] != "both"
+    ]
+    print("diff between expected and current df:")
+    print(diff)
+    return False
+
+
 @dataclass
 class ValidationConfigs:
     test_id: str
+    expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
@@ -21,6 +38,8 @@ class ValidationConfigs:
         default_factory=lambda: ["local_download_path", "metadata.date_processed"]
     )
     exclude_fields_extend: list[str] = field(default_factory=list)
+    validate_downloaded_files: bool = False
+    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -78,6 +97,13 @@ def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
     assert not diff, "diff in files that exist: {}".format(", ".join(diff))
 
 
+def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
+    expected_files = get_files(dir_path=expected_output_dir)
+    current_files = get_files(dir_path=current_output_dir)
+    diff = set(expected_files) ^ set(current_files)
+    assert not diff, "diff in files that exist: {}".format(", ".join(diff))
+
+
 def check_contents(
     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
 ):
@@ -96,6 +122,32 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+def check_raw_file_contents(
+    expected_output_dir: Path,
+    current_output_dir: Path,
+    configs: ValidationConfigs,
+):
+    current_files = get_files(dir_path=current_output_dir)
+    found_diff = False
+    files = []
+    for current_file in current_files:
+        current_file_path = current_output_dir / current_file
+        expected_file_path = expected_output_dir / current_file
+        if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+            is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
+        elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
+            is_different = not pandas_df_equality_check(
+                expected_filepath=expected_file_path, current_filepath=current_file_path
+            )
+        else:
+            is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
+        if is_different:
+            found_diff = True
+            files.append(str(expected_file_path))
+            print(f"diffs between files {expected_file_path} and {current_file_path}")
+    assert not found_diff, "Diffs found between files: {}".format(", ".join(files))
+
+
 def run_expected_results_validation(
     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
 ):
@@ -105,6 +157,21 @@ def run_expected_results_validation(
     )
 
 
+def run_expected_download_files_validation(
+    expected_output_dir: Path,
+    current_download_dir: Path,
+    configs: ValidationConfigs,
+):
+    check_files_in_paths(
+        expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
+    )
+    check_raw_file_contents(
+        expected_output_dir=expected_output_dir,
+        current_output_dir=current_download_dir,
+        configs=configs,
+    )
+
+
 def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
     directory_record = expected_output_dir / "directory_structure.json"
     with directory_record.open("r") as directory_file:
@@ -113,13 +180,18 @@ def run_directory_structure_validation(expected_output_dir: Path, download_files
     assert directory_structure == download_files
 
 
-def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[FileData]):
+def update_fixtures(
+    output_dir: Path,
+    download_dir: Path,
+    all_file_data: list[FileData],
+    save_downloads: bool = False,
+):
     # Delete current files
     shutil.rmtree(path=output_dir, ignore_errors=True)
     output_dir.mkdir(parents=True)
     # Rewrite the current file data
     file_data_output_path = output_dir / "file_data"
-    file_data_output_path.mkdir(parents=True)
+    file_data_output_path.mkdir(parents=True, exist_ok=True)
     for file_data in all_file_data:
         file_data_path = file_data_output_path / f"{file_data.identifier}.json"
         with file_data_path.open(mode="w") as f:
@@ -132,6 +204,11 @@ def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[Fi
     with download_dir_record.open(mode="w") as f:
         json.dump({"directory_structure": download_files}, f, indent=2)
 
+    # If applicable, save raw downloads
+    if save_downloads:
+        raw_download_output_path = output_dir / "downloads"
+        shutil.copytree(download_dir, raw_download_output_path)
+
 
 def run_all_validations(
     configs: ValidationConfigs,
@@ -140,6 +217,13 @@ def run_all_validations(
     download_dir: Path,
     test_output_dir: Path,
 ):
+    if expected_number_indexed_file_data := configs.expected_number_indexed_file_data:
+        assert (
+            len(predownload_file_data) == expected_number_indexed_file_data
+        ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
+    if expected_num_files := configs.expected_num_files:
+        assert len(postdownload_file_data) == expected_num_files
+
     for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
         configs.run_file_data_validation(
             predownload_file_data=pre_data, postdownload_file_data=post_data
@@ -155,6 +239,12 @@ def run_all_validations(
     run_directory_structure_validation(
         expected_output_dir=configs.test_output_dir(), download_files=download_files
     )
+    if configs.validate_downloaded_files:
+        run_expected_download_files_validation(
+            expected_output_dir=test_output_dir / "downloads",
+            current_download_dir=download_dir,
+            configs=configs,
+        )
 
 
 async def source_connector_validation(
@@ -200,4 +290,5 @@ async def source_connector_validation(
         output_dir=test_output_dir,
         download_dir=download_dir,
         all_file_data=all_postdownload_file_data,
+        save_downloads=configs.validate_downloaded_files,
     )
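The new validate_downloaded_files flag and downloaded_file_equality_check hook let a connector test compare the raw downloaded files against saved fixtures in addition to the structured output. A minimal sketch of how a test might opt in, with hypothetical values for the connector under test:

    configs = ValidationConfigs(
        test_id="example-connector",          # hypothetical test id
        expected_number_indexed_file_data=4,  # hypothetical counts
        expected_num_files=4,
        validate_downloaded_files=True,
        # Optional custom comparison; when omitted, CSV files fall back to
        # pandas_df_equality_check and everything else to filecmp.cmp.
        downloaded_file_equality_check=None,
    )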
unstructured_ingest/__version__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "0.1.1"  # pragma: no cover
+__version__ = "0.2.0"  # pragma: no cover
unstructured_ingest/v2/cli/utils/click.py (+32 -1)

@@ -1,12 +1,13 @@
 import json
 import os.path
+from datetime import date, datetime
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
 from typing import Any, Optional, Type, TypeVar, Union
 
 import click
-from pydantic import BaseModel, ConfigDict, Secret
+from pydantic import BaseModel, ConfigDict, Secret, TypeAdapter, ValidationError
 
 
 def conform_click_options(options: dict):
@@ -109,6 +110,36 @@ class DelimitedString(click.ParamType):
         return split
 
 
+class PydanticDateTime(click.ParamType):
+    name = "datetime"
+
+    def convert(
+        self,
+        value: Any,
+        param: Optional[click.Parameter] = None,
+        ctx: Optional[click.Context] = None,
+    ) -> Any:
+        try:
+            return TypeAdapter(datetime).validate_strings(value)
+        except ValidationError:
+            self.fail(f"{value} is not a valid datetime", param, ctx)
+
+
+class PydanticDate(click.ParamType):
+    name = "date"
+
+    def convert(
+        self,
+        value: Any,
+        param: Optional[click.Parameter] = None,
+        ctx: Optional[click.Context] = None,
+    ) -> Any:
+        try:
+            return TypeAdapter(date).validate_strings(value)
+        except ValidationError:
+            self.fail(f"{value} is not a valid date", param, ctx)
+
+
 BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
 
 
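Both param types delegate parsing to pydantic's TypeAdapter, so any string pydantic can coerce to a datetime or date is accepted on the command line. A rough standalone illustration (the command and option are invented):

    import click

    @click.command()
    @click.option("--modified-since", type=PydanticDateTime())
    def ingest(modified_since):
        # modified_since arrives as a datetime.datetime instance
        click.echo(f"filtering files modified since {modified_since.isoformat()}")

    # `ingest --modified-since 2024-01-01T00:00:00` parses cleanly; an invalid
    # value fails with "<value> is not a valid datetime".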
unstructured_ingest/v2/cli/utils/model_conversion.py (+10 -3)

@@ -25,7 +25,12 @@ from pydantic.fields import FieldInfo
 from pydantic.types import _SecretBase
 from pydantic_core import PydanticUndefined
 
-from unstructured_ingest.v2.cli.utils.click import DelimitedString, Dict
+from unstructured_ingest.v2.cli.utils.click import (
+    DelimitedString,
+    Dict,
+    PydanticDate,
+    PydanticDateTime,
+)
 
 NoneType = type(None)
 
@@ -135,8 +140,10 @@ def get_type_from_annotation(field_type: Any) -> click.ParamType:
         return click.UUID
     if field_type is Path:
         return click.Path(path_type=Path)
-    if field_type
-        return
+    if field_type is datetime.datetime:
+        return PydanticDateTime()
+    if field_type is datetime.date:
+        return PydanticDate()
     if field_origin is Literal:
         return click.Choice(field_args)
     if isinstance(field_type, EnumMeta):
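With these branches, pydantic fields annotated as datetime.datetime or datetime.date map to the new param types when CLI options are generated; for example, assuming the function is called directly:

    assert isinstance(get_type_from_annotation(datetime.datetime), PydanticDateTime)
    assert isinstance(get_type_from_annotation(datetime.date), PydanticDate)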
unstructured_ingest/v2/interfaces/indexer.py (+4 -1)

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generator, Optional, TypeVar
+from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
 
 from pydantic import BaseModel
 
@@ -25,3 +25,6 @@ class Indexer(BaseProcess, BaseConnector, ABC):
     @abstractmethod
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         pass
+
+    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
+        raise NotImplementedError()
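run_async gives indexers an optional asynchronous counterpart to run; the base implementation raises NotImplementedError, so existing synchronous indexers are unaffected. A hedged sketch of an override, with invented listing helpers:

    class ExampleIndexer(Indexer):
        def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
            yield from self._list_files()  # hypothetical synchronous listing

        async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
            # hypothetical non-blocking listing, e.g. via an async client
            async for file_data in self._list_files_async():
                yield file_data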
unstructured_ingest/v2/pipeline/pipeline.py (+10 -2)

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 import multiprocessing as mp
 import shutil
@@ -186,6 +187,14 @@ class Pipeline:
         filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
         return filtered_records
 
+    def get_indices(self) -> list[dict]:
+        if self.indexer_step.process.is_async():
+            indices = asyncio.run(self.indexer_step.run_async())
+        else:
+            indices = self.indexer_step.run()
+        indices_inputs = [{"file_data_path": i} for i in indices]
+        return indices_inputs
+
     def _run(self):
         logger.info(
             f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
@@ -197,8 +206,7 @@ class Pipeline:
         self.context.status = {}
 
         # Index into data source
-        indices = self.indexer_step.run()
-        indices_inputs = [{"file_data_path": i} for i in indices]
+        indices_inputs = self.get_indices()
         if not indices_inputs:
             logger.info("No files to process after indexer, exiting")
             return
unstructured_ingest/v2/pipeline/steps/index.py (+18 -1)

@@ -1,7 +1,7 @@
 import hashlib
 import json
 from dataclasses import dataclass
-from typing import Generator, Optional, TypeVar
+from typing import AsyncGenerator, Generator, Optional, TypeVar
 
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
@@ -52,6 +52,23 @@ class IndexStep(PipelineStep):
                     raise e
                 continue
 
+    async def run_async(self) -> AsyncGenerator[str, None]:
+        async for file_data in self.process.run_async():
+            logger.debug(f"generated file data: {file_data.to_dict()}")
+            try:
+                record_hash = self.get_hash(extras=[file_data.identifier])
+                filename = f"{record_hash}.json"
+                filepath = (self.cache_dir / filename).resolve()
+                filepath.parent.mkdir(parents=True, exist_ok=True)
+                with open(str(filepath), "w") as f:
+                    json.dump(file_data.to_dict(), f, indent=2)
+                yield str(filepath)
+            except Exception as e:
+                logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
+                if self.context.raise_on_error:
+                    raise e
+                continue
+
     def get_hash(self, extras: Optional[list[str]]) -> str:
         index_config_dict = json.loads(
             serialize_base_model_json(model=self.process.index_config, sort_keys=True)
unstructured_ingest/v2/processes/connectors/__init__.py (+10 -0)

@@ -18,6 +18,8 @@ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
+from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
+from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -44,6 +46,8 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
 from .singlestore import singlestore_destination_entry
+from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
+from .slack import slack_source_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
@@ -54,6 +58,10 @@ add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_desti
 add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
 add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
 
+add_destination_entry(
+    destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
+)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -93,3 +101,5 @@ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destina
 add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
+
+add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
unstructured_ingest/v2/processes/connectors/databricks/volumes.py (+1 -1)

@@ -166,7 +166,7 @@ class DatabricksVolumesUploader(Uploader, ABC):
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(self.upload_config.path,
+        output_path = os.path.join(self.upload_config.path, file_data.source_identifiers.filename)
         with open(path, "rb") as elements_file:
             self.connection_config.get_client().files.upload(
                 file_path=output_path,
unstructured_ingest/v2/processes/connectors/delta_table.py (new file, +185 -0)

@@ -0,0 +1,185 @@
+import json
+import os
+from dataclasses import dataclass, field
+from multiprocessing import Process
+from pathlib import Path
+from typing import Any, Optional
+
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.table import convert_to_pandas_dataframe
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+
+CONNECTOR_TYPE = "delta_table"
+
+
+class DeltaTableAccessConfig(AccessConfig):
+    aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
+    aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
+
+
+class DeltaTableConnectionConfig(ConnectionConfig):
+    access_config: Secret[DeltaTableAccessConfig] = Field(
+        default=DeltaTableAccessConfig(), validate_default=True
+    )
+    aws_region: Optional[str] = Field(default=None, description="AWS Region")
+    table_uri: str = Field(
+        default=None,
+        description=(
+            "Local path or path to the target folder in the S3 bucket, "
+            "formatted as s3://my-bucket/my-folder/"
+        ),
+    )
+
+    def update_storage_options(self, storage_options: dict) -> None:
+        secrets = self.access_config.get_secret_value()
+        if self.aws_region and secrets.aws_access_key_id and secrets.aws_secret_access_key:
+            storage_options["AWS_REGION"] = self.aws_region
+            storage_options["AWS_ACCESS_KEY_ID"] = secrets.aws_access_key_id
+            storage_options["AWS_SECRET_ACCESS_KEY"] = secrets.aws_secret_access_key
+            # Delta-rs doesn't support concurrent S3 writes without external locks (DynamoDB).
+            # This flag allows single-writer uploads to S3 without using locks, according to:
+            # https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/
+            storage_options["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
+
+
+class DeltaTableUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class DeltaTableUploadStager(UploadStager):
+    upload_stager_config: DeltaTableUploadStagerConfig = field(
+        default_factory=lambda: DeltaTableUploadStagerConfig()
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+
+        output_path = Path(output_dir) / Path(f"{output_filename}.parquet")
+
+        df = convert_to_pandas_dataframe(elements_dict=elements_contents)
+        df.to_parquet(output_path)
+
+        return output_path
+
+
+class DeltaTableUploaderConfig(UploaderConfig):
+    pass
+
+
+@dataclass
+class DeltaTableUploader(Uploader):
+    upload_config: DeltaTableUploaderConfig
+    connection_config: DeltaTableConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self):
+        secrets = self.connection_config.access_config.get_secret_value()
+        if (
+            self.connection_config.aws_region
+            and secrets.aws_access_key_id
+            and secrets.aws_secret_access_key
+        ):
+            from fsspec import get_filesystem_class
+
+            try:
+                fs = get_filesystem_class("s3")(
+                    key=secrets.aws_access_key_id, secret=secrets.aws_secret_access_key
+                )
+                fs.write_bytes(path=self.connection_config.table_uri, value=b"")
+
+            except Exception as e:
+                logger.error(f"failed to validate connection: {e}", exc_info=True)
+                raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def process_csv(self, csv_paths: list[Path]) -> pd.DataFrame:
+        logger.debug(f"uploading content from {len(csv_paths)} csv files")
+        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+        return df
+
+    def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
+        logger.debug(f"uploading content from {len(json_paths)} json files")
+        all_records = []
+        for p in json_paths:
+            with open(p) as json_file:
+                all_records.extend(json.load(json_file))
+
+        return pd.DataFrame(data=all_records)
+
+    def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
+        logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
+        df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
+        return df
+
+    def read_dataframe(self, path: Path) -> pd.DataFrame:
+        if path.suffix == ".csv":
+            return self.process_csv(csv_paths=[path])
+        elif path.suffix == ".json":
+            return self.process_json(json_paths=[path])
+        elif path.suffix == ".parquet":
+            return self.process_parquet(parquet_paths=[path])
+        else:
+            raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
+
+    @requires_dependencies(["deltalake"], extras="delta-table")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        from deltalake.writer import write_deltalake
+
+        df = self.read_dataframe(path)
+        updated_upload_path = os.path.join(
+            self.connection_config.table_uri, file_data.source_identifiers.relative_path
+        )
+        logger.info(
+            f"writing {len(df)} rows to destination table "
+            f"at {updated_upload_path}\ndtypes: {df.dtypes}",
+        )
+        storage_options = {}
+        self.connection_config.update_storage_options(storage_options=storage_options)
+
+        writer_kwargs = {
+            "table_or_uri": updated_upload_path,
+            "data": df,
+            "mode": "overwrite",
+            "storage_options": storage_options,
+        }
+        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
+        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
+        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
+        # rust backend to finish
+        writer = Process(
+            target=write_deltalake,
+            kwargs=writer_kwargs,
+        )
+        writer.start()
+        writer.join()
+
+
+delta_table_destination_entry = DestinationRegistryEntry(
+    connection_config=DeltaTableConnectionConfig,
+    uploader=DeltaTableUploader,
+    uploader_config=DeltaTableUploaderConfig,
+    upload_stager=DeltaTableUploadStager,
+    upload_stager_config=DeltaTableUploadStagerConfig,
+)
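Based on the configuration classes above, pointing the new destination at an S3-backed Delta table would look roughly like this (bucket, region, and credentials are placeholders):

    connection_config = DeltaTableConnectionConfig(
        table_uri="s3://my-bucket/my-folder/",   # placeholder bucket/prefix
        aws_region="us-east-2",                  # placeholder region
        access_config=DeltaTableAccessConfig(
            aws_access_key_id="AKIA...",         # placeholder credentials
            aws_secret_access_key="...",
        ),
    )
    uploader = DeltaTableUploader(
        upload_config=DeltaTableUploaderConfig(),
        connection_config=connection_config,
    )
    uploader.precheck()  # writes an empty object to table_uri to validate the credentials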