unstructured-ingest 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (34)
  1. test/integration/connectors/conftest.py +13 -0
  2. test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
  3. test/integration/connectors/sql/__init__.py +0 -0
  4. test/integration/connectors/{test_postgres.py → sql/test_postgres.py} +76 -2
  5. test/integration/connectors/sql/test_snowflake.py +205 -0
  6. test/integration/connectors/{test_sqlite.py → sql/test_sqlite.py} +68 -12
  7. test/integration/connectors/test_delta_table.py +138 -0
  8. test/integration/connectors/utils/constants.py +1 -1
  9. test/integration/connectors/utils/docker.py +78 -0
  10. test/integration/connectors/utils/validation.py +100 -4
  11. unstructured_ingest/__version__.py +1 -1
  12. unstructured_ingest/v2/cli/utils/click.py +32 -1
  13. unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
  14. unstructured_ingest/v2/interfaces/indexer.py +4 -1
  15. unstructured_ingest/v2/pipeline/pipeline.py +10 -2
  16. unstructured_ingest/v2/pipeline/steps/index.py +18 -1
  17. unstructured_ingest/v2/processes/connectors/__init__.py +10 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +1 -1
  19. unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
  20. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  21. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  22. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  23. unstructured_ingest/v2/processes/connectors/sql/__init__.py +10 -2
  24. unstructured_ingest/v2/processes/connectors/sql/postgres.py +77 -25
  25. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +164 -0
  26. unstructured_ingest/v2/processes/connectors/sql/sql.py +163 -6
  27. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +86 -24
  28. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA +16 -14
  29. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD +33 -27
  30. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
  31. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/LICENSE.md +0 -0
  32. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/WHEEL +0 -0
  33. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/entry_points.txt +0 -0
  34. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/top_level.txt +0 -0
test/integration/connectors/utils/docker.py
@@ -0,0 +1,78 @@
+ import time
+ from contextlib import contextmanager
+ from typing import Optional
+
+ import docker
+ from docker.models.containers import Container
+
+
+ def get_container(
+     docker_client: docker.DockerClient,
+     image: str,
+     ports: dict,
+     environment: Optional[dict] = None,
+     volumes: Optional[dict] = None,
+     healthcheck: Optional[dict] = None,
+ ) -> Container:
+     run_kwargs = {
+         "image": image,
+         "detach": True,
+         "ports": ports,
+     }
+     if environment:
+         run_kwargs["environment"] = environment
+     if volumes:
+         run_kwargs["volumes"] = volumes
+     if healthcheck:
+         run_kwargs["healthcheck"] = healthcheck
+     container: Container = docker_client.containers.run(**run_kwargs)
+     return container
+
+
+ def has_healthcheck(container: Container) -> bool:
+     return container.attrs.get("Config", {}).get("Healthcheck", None) is not None
+
+
+ def healthcheck_wait(container: Container, timeout: int = 10) -> None:
+     health = container.health
+     start = time.time()
+     while health != "healthy" and time.time() - start < timeout:
+         time.sleep(1)
+         container.reload()
+         health = container.health
+     if health != "healthy":
+         health_dict = container.attrs.get("State", {}).get("Health", {})
+         raise TimeoutError(f"Docker container never came up healthy: {health_dict}")
+
+
+ @contextmanager
+ def container_context(
+     docker_client: docker.DockerClient,
+     image: str,
+     ports: dict,
+     environment: Optional[dict] = None,
+     volumes: Optional[dict] = None,
+     healthcheck: Optional[dict] = None,
+     healthcheck_timeout: int = 10,
+ ):
+     container: Optional[Container] = None
+     try:
+         container = get_container(
+             docker_client=docker_client,
+             image=image,
+             ports=ports,
+             environment=environment,
+             volumes=volumes,
+             healthcheck=healthcheck,
+         )
+         if has_healthcheck(container):
+             healthcheck_wait(container=container, timeout=healthcheck_timeout)
+         yield container
+     except AssertionError as e:
+         if container:
+             logs = container.logs()
+             print(logs.decode("utf-8"))
+         raise e
+     finally:
+         if container:
+             container.kill()
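
container_context is a plain context manager over the Docker SDK. A minimal usage sketch (assuming a local Docker daemon; the image, ports, and healthcheck below are illustrative and not taken from the test suite):

    import docker

    from test.integration.connectors.utils.docker import container_context

    def test_service_roundtrip():
        client = docker.from_env()
        with container_context(
            docker_client=client,
            image="postgres:15",  # illustrative image
            ports={"5432/tcp": 5432},
            environment={"POSTGRES_PASSWORD": "test"},
            healthcheck={
                "test": ["CMD-SHELL", "pg_isready -U postgres"],
                "interval": 1_000_000_000,  # docker-py expects nanoseconds
                "retries": 30,
            },
            healthcheck_timeout=30,
        ) as container:
            # the container is healthy (or had no healthcheck) by this point;
            # run connector assertions against the service here
            assert container.id
        # the finally block kills the container even if assertions fail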
test/integration/connectors/utils/validation.py
@@ -1,3 +1,4 @@
+ import filecmp
  import json
  import os
  import shutil
@@ -5,15 +6,31 @@ from dataclasses import dataclass, field, replace
  from pathlib import Path
  from typing import Callable, Optional

+ import pandas as pd
  from deepdiff import DeepDiff

  from test.integration.connectors.utils.constants import expected_results_path
  from unstructured_ingest.v2.interfaces import Downloader, FileData, Indexer


+ def pandas_df_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
+     expected_df = pd.read_csv(expected_filepath)
+     current_df = pd.read_csv(current_filepath)
+     if expected_df.equals(current_df):
+         return True
+     # Print diff
+     diff = expected_df.merge(current_df, indicator=True, how="left").loc[
+         lambda x: x["_merge"] != "both"
+     ]
+     print("diff between expected and current df:")
+     print(diff)
+     return False
+
+
  @dataclass
  class ValidationConfigs:
      test_id: str
+     expected_number_indexed_file_data: Optional[int] = None
      expected_num_files: Optional[int] = None
      predownload_file_data_check: Optional[Callable[[FileData], None]] = None
      postdownload_file_data_check: Optional[Callable[[FileData], None]] = None
@@ -21,6 +38,8 @@ class ValidationConfigs:
          default_factory=lambda: ["local_download_path", "metadata.date_processed"]
      )
      exclude_fields_extend: list[str] = field(default_factory=list)
+     validate_downloaded_files: bool = False
+     downloaded_file_equality_check: Optional[Callable[[Path, Path], bool]] = None

      def get_exclude_fields(self) -> list[str]:
          exclude_fields = self.exclude_fields
@@ -78,6 +97,13 @@ def check_files(expected_output_dir: Path, all_file_data: list[FileData]):
      assert not diff, "diff in files that exist: {}".format(", ".join(diff))


+ def check_files_in_paths(expected_output_dir: Path, current_output_dir: Path):
+     expected_files = get_files(dir_path=expected_output_dir)
+     current_files = get_files(dir_path=current_output_dir)
+     diff = set(expected_files) ^ set(current_files)
+     assert not diff, "diff in files that exist: {}".format(", ".join(diff))
+
+
  def check_contents(
      expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
  ):
@@ -96,6 +122,32 @@ def check_contents(
      assert not found_diff, f"Diffs found between files: {found_diff}"


+ def check_raw_file_contents(
+     expected_output_dir: Path,
+     current_output_dir: Path,
+     configs: ValidationConfigs,
+ ):
+     current_files = get_files(dir_path=current_output_dir)
+     found_diff = False
+     files = []
+     for current_file in current_files:
+         current_file_path = current_output_dir / current_file
+         expected_file_path = expected_output_dir / current_file
+         if downloaded_file_equality_check := configs.downloaded_file_equality_check:
+             is_different = downloaded_file_equality_check(expected_file_path, current_file_path)
+         elif expected_file_path.suffix == ".csv" and current_file_path.suffix == ".csv":
+             is_different = not pandas_df_equality_check(
+                 expected_filepath=expected_file_path, current_filepath=current_file_path
+             )
+         else:
+             is_different = not filecmp.cmp(expected_file_path, current_file_path, shallow=False)
+         if is_different:
+             found_diff = True
+             files.append(str(expected_file_path))
+             print(f"diffs between files {expected_file_path} and {current_file_path}")
+     assert not found_diff, "Diffs found between files: {}".format(", ".join(files))
+
+
  def run_expected_results_validation(
      expected_output_dir: Path, all_file_data: list[FileData], configs: ValidationConfigs
  ):
@@ -105,6 +157,21 @@
      )


+ def run_expected_download_files_validation(
+     expected_output_dir: Path,
+     current_download_dir: Path,
+     configs: ValidationConfigs,
+ ):
+     check_files_in_paths(
+         expected_output_dir=expected_output_dir, current_output_dir=current_download_dir
+     )
+     check_raw_file_contents(
+         expected_output_dir=expected_output_dir,
+         current_output_dir=current_download_dir,
+         configs=configs,
+     )
+
+
  def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
      directory_record = expected_output_dir / "directory_structure.json"
      with directory_record.open("r") as directory_file:
@@ -113,13 +180,18 @@ def run_directory_structure_validation(expected_output_dir: Path, download_files
      assert directory_structure == download_files


- def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[FileData]):
+ def update_fixtures(
+     output_dir: Path,
+     download_dir: Path,
+     all_file_data: list[FileData],
+     save_downloads: bool = False,
+ ):
      # Delete current files
      shutil.rmtree(path=output_dir, ignore_errors=True)
      output_dir.mkdir(parents=True)
      # Rewrite the current file data
      file_data_output_path = output_dir / "file_data"
-     file_data_output_path.mkdir(parents=True)
+     file_data_output_path.mkdir(parents=True, exist_ok=True)
      for file_data in all_file_data:
          file_data_path = file_data_output_path / f"{file_data.identifier}.json"
          with file_data_path.open(mode="w") as f:
@@ -132,6 +204,11 @@ def update_fixtures(output_dir: Path, download_dir: Path, all_file_data: list[Fi
      with download_dir_record.open(mode="w") as f:
          json.dump({"directory_structure": download_files}, f, indent=2)

+     # If applicable, save raw downloads
+     if save_downloads:
+         raw_download_output_path = output_dir / "downloads"
+         shutil.copytree(download_dir, raw_download_output_path)
+

  def run_all_validations(
      configs: ValidationConfigs,
@@ -140,6 +217,13 @@ def run_all_validations(
      download_dir: Path,
      test_output_dir: Path,
  ):
+     if expected_number_indexed_file_data := configs.expected_number_indexed_file_data:
+         assert (
+             len(predownload_file_data) == expected_number_indexed_file_data
+         ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
+     if expected_num_files := configs.expected_num_files:
+         assert len(postdownload_file_data) == expected_num_files
+
      for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
          configs.run_file_data_validation(
              predownload_file_data=pre_data, postdownload_file_data=post_data
@@ -155,6 +239,12 @@
      run_directory_structure_validation(
          expected_output_dir=configs.test_output_dir(), download_files=download_files
      )
+     if configs.validate_downloaded_files:
+         run_expected_download_files_validation(
+             expected_output_dir=test_output_dir / "downloads",
+             current_download_dir=download_dir,
+             configs=configs,
+         )


  async def source_connector_validation(
@@ -180,8 +270,13 @@ async def source_connector_validation(
              resp = await downloader.run_async(file_data=file_data)
          else:
              resp = downloader.run(file_data=file_data)
-         postdownload_file_data = replace(resp["file_data"])
-         all_postdownload_file_data.append(postdownload_file_data)
+         if isinstance(resp, list):
+             for r in resp:
+                 postdownload_file_data = replace(r["file_data"])
+                 all_postdownload_file_data.append(postdownload_file_data)
+         else:
+             postdownload_file_data = replace(resp["file_data"])
+             all_postdownload_file_data.append(postdownload_file_data)
      if not overwrite_fixtures:
          run_all_validations(
              configs=configs,
@@ -195,4 +290,5 @@
              output_dir=test_output_dir,
              download_dir=download_dir,
              all_file_data=all_postdownload_file_data,
+             save_downloads=configs.validate_downloaded_files,
          )
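
The new ValidationConfigs fields let a connector test also diff its raw downloads against stored fixtures. A hedged sketch of the configuration (the test_id and counts are illustrative):

    from test.integration.connectors.utils.validation import ValidationConfigs

    configs = ValidationConfigs(
        test_id="example",                    # illustrative id
        expected_number_indexed_file_data=1,  # fail fast if indexing yields a different count
        expected_num_files=1,
        validate_downloaded_files=True,       # also compare raw downloads to fixtures
        # downloaded_file_equality_check can supply a custom per-file comparison;
        # when omitted, .csv files use pandas_df_equality_check and everything
        # else falls back to filecmp.cmp
    )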
unstructured_ingest/__version__.py
@@ -1 +1 @@
- __version__ = "0.1.0"  # pragma: no cover
+ __version__ = "0.2.0"  # pragma: no cover
unstructured_ingest/v2/cli/utils/click.py
@@ -1,12 +1,13 @@
  import json
  import os.path
+ from datetime import date, datetime
  from gettext import gettext, ngettext
  from gettext import gettext as _
  from pathlib import Path
  from typing import Any, Optional, Type, TypeVar, Union

  import click
- from pydantic import BaseModel, ConfigDict, Secret
+ from pydantic import BaseModel, ConfigDict, Secret, TypeAdapter, ValidationError


  def conform_click_options(options: dict):
@@ -109,6 +110,36 @@ class DelimitedString(click.ParamType):
          return split


+ class PydanticDateTime(click.ParamType):
+     name = "datetime"
+
+     def convert(
+         self,
+         value: Any,
+         param: Optional[click.Parameter] = None,
+         ctx: Optional[click.Context] = None,
+     ) -> Any:
+         try:
+             return TypeAdapter(datetime).validate_strings(value)
+         except ValidationError:
+             self.fail(f"{value} is not a valid datetime", param, ctx)
+
+
+ class PydanticDate(click.ParamType):
+     name = "date"
+
+     def convert(
+         self,
+         value: Any,
+         param: Optional[click.Parameter] = None,
+         ctx: Optional[click.Context] = None,
+     ) -> Any:
+         try:
+             return TypeAdapter(date).validate_strings(value)
+         except ValidationError:
+             self.fail(f"{value} is not a valid date", param, ctx)
+
+
  BaseModelT = TypeVar("BaseModelT", bound=BaseModel)

unstructured_ingest/v2/cli/utils/model_conversion.py
@@ -25,7 +25,12 @@ from pydantic.fields import FieldInfo
  from pydantic.types import _SecretBase
  from pydantic_core import PydanticUndefined

- from unstructured_ingest.v2.cli.utils.click import DelimitedString, Dict
+ from unstructured_ingest.v2.cli.utils.click import (
+     DelimitedString,
+     Dict,
+     PydanticDate,
+     PydanticDateTime,
+ )

  NoneType = type(None)

@@ -135,8 +140,10 @@ def get_type_from_annotation(field_type: Any) -> click.ParamType:
          return click.UUID
      if field_type is Path:
          return click.Path(path_type=Path)
-     if field_type in (datetime.datetime, datetime.date):
-         return click.DateTime()
+     if field_type is datetime.datetime:
+         return PydanticDateTime()
+     if field_type is datetime.date:
+         return PydanticDate()
      if field_origin is Literal:
          return click.Choice(field_args)
      if isinstance(field_type, EnumMeta):
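
Together with the PydanticDateTime/PydanticDate param types added in click.py above, this routes date and datetime model fields through pydantic parsing instead of click.DateTime. A standalone sketch of the param types on a click command (the command and option names are illustrative, not part of the package CLI):

    import click

    from unstructured_ingest.v2.cli.utils.click import PydanticDate, PydanticDateTime

    @click.command()
    @click.option("--modified-after", type=PydanticDateTime(), default=None)
    @click.option("--as-of", type=PydanticDate(), default=None)
    def demo(modified_after, as_of):
        # e.g. --modified-after 2024-01-01T12:30:00Z parses to a timezone-aware
        # datetime, which click.DateTime() would reject
        click.echo(f"{modified_after!r} {as_of!r}")

    if __name__ == "__main__":
        demo()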
unstructured_ingest/v2/interfaces/indexer.py
@@ -1,5 +1,5 @@
  from abc import ABC, abstractmethod
- from typing import Any, Generator, Optional, TypeVar
+ from typing import Any, AsyncGenerator, Generator, Optional, TypeVar

  from pydantic import BaseModel

@@ -25,3 +25,6 @@ class Indexer(BaseProcess, BaseConnector, ABC):
      @abstractmethod
      def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
          pass
+
+     async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
+         raise NotImplementedError()
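
run_async gives connectors an optional async indexing hook with the same shape as run. A minimal sketch of an override (ExampleIndexer and its listing logic are hypothetical, shown only to illustrate the async-generator contract):

    from typing import Any, AsyncGenerator, Generator

    from unstructured_ingest.v2.interfaces import FileData
    from unstructured_ingest.v2.interfaces.indexer import Indexer

    class ExampleIndexer(Indexer):  # hypothetical connector, for illustration only
        def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
            ...  # enumerate the source and yield FileData records

        async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
            # simplest possible override: reuse the synchronous listing;
            # a real connector would await its async client here instead
            for file_data in self.run(**kwargs):
                yield file_data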
unstructured_ingest/v2/pipeline/pipeline.py
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ import asyncio
  import logging
  import multiprocessing as mp
  import shutil
@@ -186,6 +187,14 @@ class Pipeline:
          filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
          return filtered_records

+     def get_indices(self) -> list[dict]:
+         if self.indexer_step.process.is_async():
+             indices = asyncio.run(self.indexer_step.run_async())
+         else:
+             indices = self.indexer_step.run()
+         indices_inputs = [{"file_data_path": i} for i in indices]
+         return indices_inputs
+
      def _run(self):
          logger.info(
              f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
@@ -197,8 +206,7 @@
          self.context.status = {}

          # Index into data source
-         indices = self.indexer_step.run()
-         indices_inputs = [{"file_data_path": i} for i in indices]
+         indices_inputs = self.get_indices()
          if not indices_inputs:
              logger.info("No files to process after indexer, exiting")
              return
unstructured_ingest/v2/pipeline/steps/index.py
@@ -1,7 +1,7 @@
  import hashlib
  import json
  from dataclasses import dataclass
- from typing import Generator, Optional, TypeVar
+ from typing import AsyncGenerator, Generator, Optional, TypeVar

  from unstructured_ingest.v2.interfaces.indexer import Indexer
  from unstructured_ingest.v2.logger import logger
@@ -52,6 +52,23 @@ class IndexStep(PipelineStep):
                      raise e
                  continue

+     async def run_async(self) -> AsyncGenerator[str, None]:
+         async for file_data in self.process.run_async():
+             logger.debug(f"generated file data: {file_data.to_dict()}")
+             try:
+                 record_hash = self.get_hash(extras=[file_data.identifier])
+                 filename = f"{record_hash}.json"
+                 filepath = (self.cache_dir / filename).resolve()
+                 filepath.parent.mkdir(parents=True, exist_ok=True)
+                 with open(str(filepath), "w") as f:
+                     json.dump(file_data.to_dict(), f, indent=2)
+                 yield str(filepath)
+             except Exception as e:
+                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
+                 if self.context.raise_on_error:
+                     raise e
+                 continue
+
      def get_hash(self, extras: Optional[list[str]]) -> str:
          index_config_dict = json.loads(
              serialize_base_model_json(model=self.process.index_config, sort_keys=True)
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -18,6 +18,8 @@ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
  from .chroma import chroma_destination_entry
  from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
  from .couchbase import couchbase_destination_entry, couchbase_source_entry
+ from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
+ from .delta_table import delta_table_destination_entry
  from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
  from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -44,6 +46,8 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
  from .sharepoint import sharepoint_source_entry
  from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
  from .singlestore import singlestore_destination_entry
+ from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
+ from .slack import slack_source_entry
  from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
  from .weaviate import weaviate_destination_entry

@@ -54,6 +58,10 @@ add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_desti
  add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
  add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)

+ add_destination_entry(
+     destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
+ )
+
  add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
  add_destination_entry(
      destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -93,3 +101,5 @@ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destina
  add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)

  add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
+
+ add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -166,7 +166,7 @@ class DatabricksVolumesUploader(Uploader, ABC):
              raise DestinationConnectionError(f"failed to validate connection: {e}")

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         output_path = os.path.join(self.upload_config.path, path.name)
+         output_path = os.path.join(self.upload_config.path, file_data.source_identifiers.filename)
          with open(path, "rb") as elements_file:
              self.connection_config.get_client().files.upload(
                  file_path=output_path,
unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -0,0 +1,185 @@
+ import json
+ import os
+ from dataclasses import dataclass, field
+ from multiprocessing import Process
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import pandas as pd
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.utils.table import convert_to_pandas_dataframe
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     FileData,
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+
+ CONNECTOR_TYPE = "delta_table"
+
+
+ class DeltaTableAccessConfig(AccessConfig):
+     aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
+     aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
+
+
+ class DeltaTableConnectionConfig(ConnectionConfig):
+     access_config: Secret[DeltaTableAccessConfig] = Field(
+         default=DeltaTableAccessConfig(), validate_default=True
+     )
+     aws_region: Optional[str] = Field(default=None, description="AWS Region")
+     table_uri: str = Field(
+         default=None,
+         description=(
+             "Local path or path to the target folder in the S3 bucket, "
+             "formatted as s3://my-bucket/my-folder/"
+         ),
+     )
+
+     def update_storage_options(self, storage_options: dict) -> None:
+         secrets = self.access_config.get_secret_value()
+         if self.aws_region and secrets.aws_access_key_id and secrets.aws_secret_access_key:
+             storage_options["AWS_REGION"] = self.aws_region
+             storage_options["AWS_ACCESS_KEY_ID"] = secrets.aws_access_key_id
+             storage_options["AWS_SECRET_ACCESS_KEY"] = secrets.aws_secret_access_key
+             # Delta-rs doesn't support concurrent S3 writes without external locks (DynamoDB).
+             # This flag allows single-writer uploads to S3 without using locks, according to:
+             # https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/
+             storage_options["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
+
+
+ class DeltaTableUploadStagerConfig(UploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class DeltaTableUploadStager(UploadStager):
+     upload_stager_config: DeltaTableUploadStagerConfig = field(
+         default_factory=lambda: DeltaTableUploadStagerConfig()
+     )
+
+     def run(
+         self,
+         elements_filepath: Path,
+         output_dir: Path,
+         output_filename: str,
+         **kwargs: Any,
+     ) -> Path:
+         with open(elements_filepath) as elements_file:
+             elements_contents = json.load(elements_file)
+
+         output_path = Path(output_dir) / Path(f"{output_filename}.parquet")
+
+         df = convert_to_pandas_dataframe(elements_dict=elements_contents)
+         df.to_parquet(output_path)
+
+         return output_path
+
+
+ class DeltaTableUploaderConfig(UploaderConfig):
+     pass
+
+
+ @dataclass
+ class DeltaTableUploader(Uploader):
+     upload_config: DeltaTableUploaderConfig
+     connection_config: DeltaTableConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+     def precheck(self):
+         secrets = self.connection_config.access_config.get_secret_value()
+         if (
+             self.connection_config.aws_region
+             and secrets.aws_access_key_id
+             and secrets.aws_secret_access_key
+         ):
+             from fsspec import get_filesystem_class
+
+             try:
+                 fs = get_filesystem_class("s3")(
+                     key=secrets.aws_access_key_id, secret=secrets.aws_secret_access_key
+                 )
+                 fs.write_bytes(path=self.connection_config.table_uri, value=b"")
+
+             except Exception as e:
+                 logger.error(f"failed to validate connection: {e}", exc_info=True)
+                 raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     def process_csv(self, csv_paths: list[Path]) -> pd.DataFrame:
+         logger.debug(f"uploading content from {len(csv_paths)} csv files")
+         df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+         return df
+
+     def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
+         logger.debug(f"uploading content from {len(json_paths)} json files")
+         all_records = []
+         for p in json_paths:
+             with open(p) as json_file:
+                 all_records.extend(json.load(json_file))
+
+         return pd.DataFrame(data=all_records)
+
+     def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
+         logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
+         df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
+         return df
+
+     def read_dataframe(self, path: Path) -> pd.DataFrame:
+         if path.suffix == ".csv":
+             return self.process_csv(csv_paths=[path])
+         elif path.suffix == ".json":
+             return self.process_json(json_paths=[path])
+         elif path.suffix == ".parquet":
+             return self.process_parquet(parquet_paths=[path])
+         else:
+             raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
+
+     @requires_dependencies(["deltalake"], extras="delta-table")
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         from deltalake.writer import write_deltalake
+
+         df = self.read_dataframe(path)
+         updated_upload_path = os.path.join(
+             self.connection_config.table_uri, file_data.source_identifiers.relative_path
+         )
+         logger.info(
+             f"writing {len(df)} rows to destination table "
+             f"at {updated_upload_path}\ndtypes: {df.dtypes}",
+         )
+         storage_options = {}
+         self.connection_config.update_storage_options(storage_options=storage_options)
+
+         writer_kwargs = {
+             "table_or_uri": updated_upload_path,
+             "data": df,
+             "mode": "overwrite",
+             "storage_options": storage_options,
+         }
+         # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
+         # ingest to fail, even though all tasks are completed normally. Putting the writer into a
+         # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
+         # rust backend to finish
+         writer = Process(
+             target=write_deltalake,
+             kwargs=writer_kwargs,
+         )
+         writer.start()
+         writer.join()
+
+
+ delta_table_destination_entry = DestinationRegistryEntry(
+     connection_config=DeltaTableConnectionConfig,
+     uploader=DeltaTableUploader,
+     uploader_config=DeltaTableUploaderConfig,
+     upload_stager=DeltaTableUploadStager,
+     upload_stager_config=DeltaTableUploadStagerConfig,
+ )
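
For reference, a hedged sketch of configuring the new destination directly (the local table_uri is illustrative; for S3 targets the connector sets AWS_S3_ALLOW_UNSAFE_RENAME, so only one writer should target a given table at a time):

    from unstructured_ingest.v2.processes.connectors.delta_table import (
        DeltaTableAccessConfig,
        DeltaTableConnectionConfig,
        DeltaTableUploader,
        DeltaTableUploaderConfig,
    )

    connection_config = DeltaTableConnectionConfig(
        access_config=DeltaTableAccessConfig(),  # no AWS credentials needed for a local path
        table_uri="/tmp/elements-delta-table",   # illustrative local target
    )
    uploader = DeltaTableUploader(
        upload_config=DeltaTableUploaderConfig(),
        connection_config=connection_config,
    )
    # the pipeline invokes uploader.run(path=..., file_data=...) with the parquet
    # file produced by DeltaTableUploadStager for each processed document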