unstructured-ingest 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/conftest.py +13 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
- test/integration/connectors/sql/test_postgres.py +6 -10
- test/integration/connectors/sql/test_singlestore.py +156 -0
- test/integration/connectors/sql/test_snowflake.py +205 -0
- test/integration/connectors/sql/test_sqlite.py +6 -10
- test/integration/connectors/test_delta_table.py +138 -0
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/utils/docker.py +78 -0
- test/integration/connectors/utils/docker_compose.py +23 -8
- test/integration/connectors/utils/validation.py +93 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +32 -1
- unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
- unstructured_ingest/v2/interfaces/file_data.py +1 -0
- unstructured_ingest/v2/interfaces/indexer.py +4 -1
- unstructured_ingest/v2/pipeline/pipeline.py +10 -2
- unstructured_ingest/v2/pipeline/steps/index.py +18 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -6
- unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +3 -1
- unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -0
- unstructured_ingest/v2/processes/connectors/slack.py +248 -0
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +15 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +33 -56
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +162 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +51 -12
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +31 -32
- unstructured_ingest/v2/unstructured_api.py +1 -1
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/METADATA +19 -17
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/RECORD +37 -31
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
- unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/docker.py
@@ -0,0 +1,78 @@
+import time
+from contextlib import contextmanager
+from typing import Optional
+
+import docker
+from docker.models.containers import Container
+
+
+def get_container(
+    docker_client: docker.DockerClient,
+    image: str,
+    ports: dict,
+    environment: Optional[dict] = None,
+    volumes: Optional[dict] = None,
+    healthcheck: Optional[dict] = None,
+) -> Container:
+    run_kwargs = {
+        "image": image,
+        "detach": True,
+        "ports": ports,
+    }
+    if environment:
+        run_kwargs["environment"] = environment
+    if volumes:
+        run_kwargs["volumes"] = volumes
+    if healthcheck:
+        run_kwargs["healthcheck"] = healthcheck
+    container: Container = docker_client.containers.run(**run_kwargs)
+    return container
+
+
+def has_healthcheck(container: Container) -> bool:
+    return container.attrs.get("Config", {}).get("Healthcheck", None) is not None
+
+
+def healthcheck_wait(container: Container, timeout: int = 10) -> None:
+    health = container.health
+    start = time.time()
+    while health != "healthy" and time.time() - start < timeout:
+        time.sleep(1)
+        container.reload()
+        health = container.health
+    if health != "healthy":
+        health_dict = container.attrs.get("State", {}).get("Health", {})
+        raise TimeoutError(f"Docker container never came up healthy: {health_dict}")
+
+
+@contextmanager
+def container_context(
+    docker_client: docker.DockerClient,
+    image: str,
+    ports: dict,
+    environment: Optional[dict] = None,
+    volumes: Optional[dict] = None,
+    healthcheck: Optional[dict] = None,
+    healthcheck_timeout: int = 10,
+):
+    container: Optional[Container] = None
+    try:
+        container = get_container(
+            docker_client=docker_client,
+            image=image,
+            ports=ports,
+            environment=environment,
+            volumes=volumes,
+            healthcheck=healthcheck,
+        )
+        if has_healthcheck(container):
+            healthcheck_wait(container=container, timeout=healthcheck_timeout)
+        yield container
+    except AssertionError as e:
+        if container:
+            logs = container.logs()
+            print(logs.decode("utf-8"))
+        raise e
+    finally:
+        if container:
+            container.kill()

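A minimal usage sketch for this new helper, assuming a local Docker daemon is available; the image, port mapping, and environment values are illustrative rather than taken from the test suite:

import docker

from test.integration.connectors.utils.docker import container_context

# Spin up a throwaway Postgres container for the duration of a test; it is
# killed automatically on exit, and its logs are printed if an assertion
# inside the block fails.
client = docker.from_env()
with container_context(
    docker_client=client,
    image="postgres:15",
    ports={"5432/tcp": 5432},
    environment={"POSTGRES_PASSWORD": "test", "POSTGRES_DB": "ingest_test"},
) as container:
    assert container.status in ("created", "running")
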
test/integration/connectors/utils/docker_compose.py
@@ -3,6 +3,23 @@ from contextlib import contextmanager
 from pathlib import Path
 
 
+def docker_compose_down(docker_compose_path: Path):
+    cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v --rmi all"
+    print(f"Running command: {cmd}")
+    final_resp = subprocess.run(
+        cmd,
+        shell=True,
+        capture_output=True,
+    )
+    if final_resp.returncode != 0:
+        print("STDOUT: {}".format(final_resp.stdout.decode("utf-8")))
+        print("STDERR: {}".format(final_resp.stderr.decode("utf-8")))
+
+
+def run_cleanup(docker_compose_path: Path):
+    docker_compose_down(docker_compose_path=docker_compose_path)
+
+
 @contextmanager
 def docker_compose_context(docker_compose_path: Path):
     # Dynamically run a specific docker compose file and make sure it gets cleanup by
@@ -30,15 +47,13 @@ def docker_compose_context(docker_compose_path: Path):
         if resp:
             print("STDOUT: {}".format(resp.stdout.decode("utf-8")))
             print("STDERR: {}".format(resp.stderr.decode("utf-8")))
-
-
-        cmd = f"docker compose -f {docker_compose_path.resolve()} down --remove-orphans -v"
-        print(f"Running command: {cmd}")
-        final_resp = subprocess.run(
+        cmd = f"docker compose -f {docker_compose_path.resolve()} logs"
+        logs = subprocess.run(
             cmd,
             shell=True,
             capture_output=True,
         )
-
-
-
+        print("DOCKER LOGS: {}".format(logs.stdout.decode("utf-8")))
+        raise e
+    finally:
+        run_cleanup(docker_compose_path=docker_compose_path)

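For reference, a sketch of how the compose context manager is typically driven; the compose file path and the test body are placeholders:

from pathlib import Path

from test.integration.connectors.utils.docker_compose import docker_compose_context

# Bring the stack up for the block below; on any failure the stack's logs are
# dumped, and run_cleanup() now removes containers, volumes, orphans and images.
with docker_compose_context(docker_compose_path=Path("docker-compose.yaml")):
    pass  # connector assertions would run here
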
test/integration/connectors/utils/validation.py
@@ -1,3 +1,4 @@
+import filecmp
 import json
 import os
 import shutil
@@ -5,15 +6,31 @@ from dataclasses import dataclass, field, replace
 from pathlib import Path
 from typing import Callable, Optional
 
+import pandas as pd
 from deepdiff import DeepDiff
 
 from test.integration.connectors.utils.constants import expected_results_path
 from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer
 
 
+def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+    expected_df = pd.read_csv(expected_filepath)
+    current_df = pd.read_csv(current_filepath)
+    if expected_df.equals(current_df):
+        return True
+    # Print diff
+    diff = expected_df.merge(current_df, indicator=True, how="left").loc[
+        lambda x: x["_merge"] != "both"
+    ]
+    print("diff between expected and current df:")
+    print(diff)
+    return False
+
+
 @dataclass
 class ValidationConfigs:
     test_id: str
+    expected_number_indexed_file_data: Optional[int] = None
     expected_num_files: Optional[int] = None
     predownload_file_data_check: Optional[Callable[[FileData], None]] = None
     postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
@@ -21,6 +38,8 @@ class ValidationConfigs:
         default_factory=lambda: ["local_download_path", "metadata.date_processed"]
     )
     exclude_fields_extend: list[str] = field(default_factory=list)
+    validate_downloaded_files: bool = False
+    downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None
 
     def get_exclude_fields(self) -> list[str]:
         exclude_fields = self.exclude_fields
@@ -78,6 +97,13 @@ def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
     assert not diff, "diff in files that exist: {}".format(", ".join(diff))
 
 
+def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
+    expected_files = get_files(dir_path=expected_output_dir)
+    current_files = get_files(dir_path=current_output_dir)
+    diff = set(expected_files) ^ set(current_files)
+    assert not diff, "diff in files that exist: {}".format(", ".join(diff))
+
+
 def check_contents(
     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
 ):
@@ -96,6 +122,32 @@ def check_contents(
     assert not found_diff, f"Diffs found between files: {found_diff}"
 
 
+def check_raw_file_contents(
+    expected_output_dir: Path,
+    current_output_dir: Path,
+    configs: ValidationConfigs,
+):
+    current_files = get_files(dir_path=current_output_dir)
+    found_diff = False
+    files = []
+    for current_file in current_files:
+        current_file_path = current_output_dir / current_file
+        expected_file_path = expected_output_dir / current_file
+        if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+            is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
+        elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
+            is_different = not pandas_df_equality_check(
+                expected_filepath=expected_file_path, current_filepath=current_file_path
+            )
+        else:
+            is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
+        if is_different:
+            found_diff = True
+            files.append(str(expected_file_path))
+            print(f"diffs between files {expected_file_path} and {current_file_path}")
+    assert not found_diff, "Diffs found between files: {}".format(", ".join(files))
+
+
 def run_expected_results_validation(
     expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
 ):
@@ -105,6 +157,21 @@ def run_expected_results_validation(
     )
 
 
+def run_expected_download_files_validation(
+    expected_output_dir: Path,
+    current_download_dir: Path,
+    configs: ValidationConfigs,
+):
+    check_files_in_paths(
+        expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
+    )
+    check_raw_file_contents(
+        expected_output_dir=expected_output_dir,
+        current_output_dir=current_download_dir,
+        configs=configs,
+    )
+
+
 def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
     directory_record = expected_output_dir / "directory_structure.json"
     with directory_record.open("r") as directory_file:
@@ -113,13 +180,18 @@ def run_directory_structure_validation(expected_output_dir: Path, download_files
         assert directory_structure == download_files
 
 
-def update_fixtures(
+def update_fixtures(
+    output_dir: Path,
+    download_dir: Path,
+    all_file_data: list[FileData],
+    save_downloads: bool = False,
+):
     # Delete current files
     shutil.rmtree(path=output_dir, ignore_errors=True)
     output_dir.mkdir(parents=True)
     # Rewrite the current file data
     file_data_output_path = output_dir / "file_data"
-    file_data_output_path.mkdir(parents=True)
+    file_data_output_path.mkdir(parents=True, exist_ok=True)
     for file_data in all_file_data:
         file_data_path = file_data_output_path / f"{file_data.identifier}.json"
         with file_data_path.open(mode="w") as f:
@@ -132,6 +204,11 @@ def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[Fi
     with download_dir_record.open(mode="w") as f:
         json.dump({"directory_structure": download_files}, f, indent=2)
 
+    # If applicable, save raw downloads
+    if save_downloads:
+        raw_download_output_path = output_dir / "downloads"
+        shutil.copytree(download_dir, raw_download_output_path)
+
 
 def run_all_validations(
     configs: ValidationConfigs,
@@ -140,6 +217,13 @@ def run_all_validations(
     download_dir: Path,
     test_output_dir: Path,
 ):
+    if expected_number_indexed_file_data := configs.expected_number_indexed_file_data:
+        assert (
+            len(predownload_file_data) == expected_number_indexed_file_data
+        ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
+    if expected_num_files := configs.expected_num_files:
+        assert len(postdownload_file_data) == expected_num_files
+
     for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
         configs.run_file_data_validation(
             predownload_file_data=pre_data, postdownload_file_data=post_data
@@ -155,6 +239,12 @@ def run_all_validations(
     run_directory_structure_validation(
         expected_output_dir=configs.test_output_dir(), download_files=download_files
    )
+    if configs.validate_downloaded_files:
+        run_expected_download_files_validation(
+            expected_output_dir=test_output_dir / "downloads",
+            current_download_dir=download_dir,
+            configs=configs,
+        )
 
 
 async def source_connector_validation(
@@ -200,4 +290,5 @@ async def source_connector_validation(
         output_dir=test_output_dir,
         download_dir=download_dir,
         all_file_data=all_postdownload_file_data,
+        save_downloads=configs.validate_downloaded_files,
     )

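To show the new validation knobs in context, a sketch of a connector test opting into raw-download validation; the test_id and counts are placeholders, and the sketch assumes no other required fields:

from test.integration.connectors.utils.validation import ValidationConfigs

configs = ValidationConfigs(
    test_id="postgres",
    expected_number_indexed_file_data=1,  # records the indexer should yield
    expected_num_files=1,  # files expected after download
    validate_downloaded_files=True,  # also diff raw downloads against saved fixtures
    # downloaded_file_equality_check=custom_compare,  # optional; CSVs fall back to the pandas check
)
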
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.
+__version__ = "0.2.1" # pragma: no cover

unstructured_ingest/v2/cli/utils/click.py
@@ -1,12 +1,13 @@
 import json
 import os.path
+from datetime import date, datetime
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
 from typing import Any, Optional, Type, TypeVar, Union
 
 import click
-from pydantic import BaseModel, ConfigDict, Secret
+from pydantic import BaseModel, ConfigDict, Secret, TypeAdapter, ValidationError
 
 
 def conform_click_options(options: dict):
@@ -109,6 +110,36 @@ class DelimitedString(click.ParamType):
         return split
 
 
+class PydanticDateTime(click.ParamType):
+    name = "datetime"
+
+    def convert(
+        self,
+        value: Any,
+        param: Optional[click.Parameter] = None,
+        ctx: Optional[click.Context] = None,
+    ) -> Any:
+        try:
+            return TypeAdapter(datetime).validate_strings(value)
+        except ValidationError:
+            self.fail(f"{value} is not a valid datetime", param, ctx)
+
+
+class PydanticDate(click.ParamType):
+    name = "date"
+
+    def convert(
+        self,
+        value: Any,
+        param: Optional[click.Parameter] = None,
+        ctx: Optional[click.Context] = None,
+    ) -> Any:
+        try:
+            return TypeAdapter(date).validate_strings(value)
+        except ValidationError:
+            self.fail(f"{value} is not a valid date", param, ctx)
+
+
 BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
 
 
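For orientation, a sketch of how these param types attach to a Click option; the command and option names are invented for the example:

import click

from unstructured_ingest.v2.cli.utils.click import PydanticDate, PydanticDateTime

@click.command()
@click.option("--modified-after", type=PydanticDateTime(), default=None)
@click.option("--as-of-date", type=PydanticDate(), default=None)
def example(modified_after, as_of_date):
    # Values arrive already validated as datetime.datetime / datetime.date (or None).
    click.echo(f"{modified_after!r} {as_of_date!r}")

if __name__ == "__main__":
    example()
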
unstructured_ingest/v2/cli/utils/model_conversion.py
@@ -25,7 +25,12 @@ from pydantic.fields import FieldInfo
 from pydantic.types import _SecretBase
 from pydantic_core import PydanticUndefined
 
-from unstructured_ingest.v2.cli.utils.click import
+from unstructured_ingest.v2.cli.utils.click import (
+    DelimitedString,
+    Dict,
+    PydanticDate,
+    PydanticDateTime,
+)
 
 NoneType = type(None)
 
@@ -135,8 +140,10 @@ def get_type_from_annotation(field_type: Any) -> click.ParamType:
         return click.UUID
     if field_type is Path:
         return click.Path(path_type=Path)
-    if field_type
-        return
+    if field_type is datetime.datetime:
+        return PydanticDateTime()
+    if field_type is datetime.date:
+        return PydanticDate()
     if field_origin is Literal:
         return click.Choice(field_args)
     if isinstance(field_type, EnumMeta):

unstructured_ingest/v2/interfaces/file_data.py
@@ -43,6 +43,7 @@ class FileData(DataClassJsonMixin):
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
     local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
 
     @classmethod
     def from_file(cls, path: str) -> "FileData":

unstructured_ingest/v2/interfaces/indexer.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generator, Optional, TypeVar
+from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
 
 from pydantic import BaseModel
 
@@ -25,3 +25,6 @@ class Indexer(BaseProcess, BaseConnector, ABC):
     @abstractmethod
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         pass
+
+    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
+        raise NotImplementedError()

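The new hook turns indexing into an async generator that callers drain with async for; a standalone sketch of that contract using plain strings (illustrative only, not the package API):

import asyncio
from typing import AsyncGenerator

async def run_async(**kwargs) -> AsyncGenerator[str, None]:
    # A connector override would yield FileData records as the source is listed.
    for identifier in ("doc-1", "doc-2", "doc-3"):
        yield identifier

async def drain() -> list[str]:
    return [item async for item in run_async()]

print(asyncio.run(drain()))  # ['doc-1', 'doc-2', 'doc-3']
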
unstructured_ingest/v2/pipeline/pipeline.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 import multiprocessing as mp
 import shutil
@@ -186,6 +187,14 @@ class Pipeline:
         filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
         return filtered_records
 
+    def get_indices(self) -> list[dict]:
+        if self.indexer_step.process.is_async():
+            indices = asyncio.run(self.indexer_step.run_async())
+        else:
+            indices = self.indexer_step.run()
+        indices_inputs = [{"file_data_path": i} for i in indices]
+        return indices_inputs
+
     def _run(self):
         logger.info(
             f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
@@ -197,8 +206,7 @@ class Pipeline:
         self.context.status = {}
 
         # Index into data source
-
-        indices_inputs = [{"file_data_path": i} for i in indices]
+        indices_inputs = self.get_indices()
         if not indices_inputs:
             logger.info("No files to process after indexer, exiting")
             return

unstructured_ingest/v2/pipeline/steps/index.py
@@ -1,7 +1,7 @@
 import hashlib
 import json
 from dataclasses import dataclass
-from typing import Generator, Optional, TypeVar
+from typing import AsyncGenerator, Generator, Optional, TypeVar
 
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
@@ -52,6 +52,23 @@ class IndexStep(PipelineStep):
                     raise e
                 continue
 
+    async def run_async(self) -> AsyncGenerator[str, None]:
+        async for file_data in self.process.run_async():
+            logger.debug(f"generated file data: {file_data.to_dict()}")
+            try:
+                record_hash = self.get_hash(extras=[file_data.identifier])
+                filename = f"{record_hash}.json"
+                filepath = (self.cache_dir / filename).resolve()
+                filepath.parent.mkdir(parents=True, exist_ok=True)
+                with open(str(filepath), "w") as f:
+                    json.dump(file_data.to_dict(), f, indent=2)
+                yield str(filepath)
+            except Exception as e:
+                logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
+                if self.context.raise_on_error:
+                    raise e
+                continue
+
     def get_hash(self, extras: Optional[list[str]]) -> str:
         index_config_dict = json.loads(
             serialize_base_model_json(model=self.process.index_config, sort_keys=True)

unstructured_ingest/v2/processes/connectors/__init__.py
@@ -11,13 +11,15 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
-from .astradb import astra_db_destination_entry
+from .astradb import astra_db_destination_entry, astra_db_source_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
 from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
+from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
+from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -42,11 +44,12 @@ from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
 from .salesforce import salesforce_source_entry
 from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
-from .
-from .
+from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
+from .slack import slack_source_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
+add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
 add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
@@ -54,6 +57,10 @@ add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_desti
 add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
 add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
 
+add_destination_entry(
+    destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
+)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -80,9 +87,7 @@ add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
 
 add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
 add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
-
-    destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
-)
+
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
     destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
@@ -93,3 +98,5 @@ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destina
 add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
+
+add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)