unstructured-ingest 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
- test/integration/connectors/utils/validation/equality.py +2 -1
- test/unit/v2/connectors/databricks/__init__.py +0 -0
- test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
- test/unit/v2/connectors/sql/test_sql.py +4 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/data_prep.py +11 -3
- unstructured_ingest/utils/html.py +109 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
- unstructured_ingest/v2/processes/connectors/confluence.py +95 -25
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -7
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/METADATA +23 -20
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/RECORD +35 -30
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -3,10 +3,11 @@ import os
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import Any, Generator
 
 from pydantic import Field
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -14,9 +15,9 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
 from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
-
-
-
+    DatabricksDeltaTablesConnectionConfig,
+    DatabricksDeltaTablesUploadStager,
+    DatabricksDeltaTablesUploadStagerConfig,
 )
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
@@ -28,17 +29,16 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix
 
 
 @dataclass
-class DatabricksVolumeDeltaTableStager(
-    def write_output(self, output_path: Path, data: list[dict]
+class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-
-        json.dump(data, f)
+        write_data(path=output_path.with_suffix(".json"), data=data, indent=None)
 
 
 @dataclass
 class DatabricksVolumeDeltaTableUploader(Uploader):
-    connection_config:
+    connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
@@ -78,7 +78,10 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     @contextmanager
     def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor(**connect_kwargs) as cursor:
+            logger.debug(f"executing: USE CATALOG: '{self.upload_config.catalog}'")
             cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            logger.debug(f"executing: USE DATABASE: {self.upload_config.database}")
+            cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -98,9 +101,9 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=
+    connection_config=DatabricksDeltaTablesConnectionConfig,
     uploader=DatabricksVolumeDeltaTableUploader,
     uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
     upload_stager=DatabricksVolumeDeltaTableStager,
-    upload_stager_config=
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )
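
The expanded `get_cursor` override now pins the session to the configured catalog and database before yielding the cursor. Below is a minimal sketch of that pattern, assuming a generic DB-API style `connection` object and caller-supplied `catalog`/`database` names rather than the library's actual classes:

```python
import logging
from contextlib import contextmanager
from typing import Any, Generator

logger = logging.getLogger(__name__)


@contextmanager
def scoped_cursor(connection: Any, catalog: str, database: str) -> Generator[Any, None, None]:
    # Open a cursor, pin the session to the desired catalog and database, then hand it out.
    cursor = connection.cursor()
    try:
        logger.debug("executing: USE CATALOG '%s'", catalog)
        cursor.execute(f"USE CATALOG '{catalog}'")
        logger.debug("executing: USE DATABASE %s", database)
        cursor.execute(f"USE DATABASE {database}")
        yield cursor
    finally:
        cursor.close()
```

Statements run on the yielded cursor then target the intended catalog and database without repeating the qualifiers in every query.
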
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -4,7 +4,7 @@ from typing import Any
 
 import pandas as pd
 
-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id
 
@@ -96,5 +96,5 @@ class BaseDuckDBUploadStager(UploadStager):
             df[column] = df[column].apply(str)
 
         data = df.to_dict(orient="records")
-
+        write_data(path=output_path, data=data)
         return output_path
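
Both this stager and the Databricks volume stager above now delegate serialization to `write_data` from `unstructured_ingest.utils.data_prep`. The helper itself is not shown in this diff; the sketch below is only a guess at what a suffix-aware writer of this kind could look like (the NDJSON branch is an assumption based on the new `utils/ndjson.py` module), not the actual implementation:

```python
import json
from pathlib import Path
from typing import Any, Optional


def write_records(path: Path, data: list[dict[str, Any]], indent: Optional[int] = None) -> None:
    # Hypothetical helper: write a list of element dicts either as newline-delimited JSON
    # or as a single JSON document, based on the target suffix.
    if path.suffix == ".ndjson":
        with path.open("w") as f:
            for record in data:
                f.write(json.dumps(record) + "\n")
    else:
        with path.open("w") as f:
            json.dump(data, f, indent=indent)


# The Databricks volume stager passes indent=None so no literal newlines end up
# inside the staged .json file.
write_records(Path("staged_elements.json"), [{"text": "hello"}, {"text": "world"}], indent=None)
```
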
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -128,22 +128,22 @@ class AzureIndexer(FsspecIndexer):
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)
 
-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = (
-            str(
-            if "creation_time" in
+            str(file_info.get("creation_time").timestamp())
+            if "creation_time" in file_info
             else None
         )
         date_modified = (
-            str(
-            if "last_modified" in
+            str(file_info.get("last_modified").timestamp())
+            if "last_modified" in file_info
             else None
         )
 
-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None
 
-        version =
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
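
All of the fsspec indexers below now take the raw `file_info` dict from the filesystem listing and map it to normalized, string-valued timestamps. A standalone sketch of that normalization for an Azure-style info dict, assuming `creation_time` and `last_modified` arrive as `datetime` objects as the diff implies:

```python
from datetime import datetime, timezone
from typing import Any, Optional


def normalize_file_info(file_info: dict[str, Any]) -> dict[str, Any]:
    # Map provider-specific listing fields onto the normalized string-timestamp form
    # used by the source metadata.
    created: Optional[datetime] = file_info.get("creation_time")
    modified: Optional[datetime] = file_info.get("last_modified")
    return {
        "path": file_info["name"],
        "date_created": str(created.timestamp()) if created else None,
        "date_modified": str(modified.timestamp()) if modified else None,
        "file_size": file_info.get("size"),
        "version": file_info.get("etag"),
    }


print(normalize_file_info({
    "name": "container/reports/2024.pdf",
    "creation_time": datetime(2024, 1, 1, tzinfo=timezone.utc),
    "last_modified": datetime(2024, 2, 1, tzinfo=timezone.utc),
    "size": 1024,
    "etag": "0x8DC123ABC",
}))
```
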
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -104,22 +104,22 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str :=
+        if modified_at_str := file_info.get("modified_at"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str :=
+        if created_at_str := file_info.get("created_at"):
             date_created = str(parser.parse(created_at_str).timestamp())
 
-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None
 
-        version =
+        version = file_info.get("id")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,
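
Box (and GCS below) report timestamps as ISO strings, so the indexer parses them with `dateutil` only when the key is present, using the walrus operator. A small illustration:

```python
from dateutil import parser

file_info = {"modified_at": "2024-03-01T12:00:00Z"}

date_modified = None
if modified_at_str := file_info.get("modified_at"):
    # parser.parse handles ISO-8601 strings; timestamp() converts to epoch seconds.
    date_modified = str(parser.parse(modified_at_str).timestamp())

print(date_modified)  # "1709294400.0"
```
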
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -93,15 +93,15 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_path(self,
-        return
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]
 
-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"].lstrip("/")
         date_created = None
         date_modified = None
-        server_modified =
-        client_modified =
+        server_modified = file_info.get("server_modified")
+        client_modified = file_info.get("client_modified")
         if server_modified and client_modified and server_modified > client_modified:
             date_created = str(client_modified.timestamp())
             date_modified = str(server_modified.timestamp())
@@ -109,13 +109,13 @@ class DropboxIndexer(FsspecIndexer):
             date_created = str(server_modified.timestamp())
             date_modified = str(client_modified.timestamp())
 
-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None
 
-        version =
+        version = file_info.get("content_hash")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,
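
Dropbox exposes both `server_modified` and `client_modified`; the indexer takes the earlier of the two as the creation time and the later as the modification time. A compact sketch that is equivalent when both values are present:

```python
from datetime import datetime, timezone
from typing import Optional


def created_and_modified(
    server_modified: Optional[datetime], client_modified: Optional[datetime]
) -> tuple[Optional[str], Optional[str]]:
    # Return (date_created, date_modified) as epoch-second strings; the earlier timestamp
    # is treated as creation, the later one as the last modification.
    if not (server_modified and client_modified):
        return None, None
    earlier, later = sorted([server_modified, client_modified])
    return str(earlier.timestamp()), str(later.timestamp())


print(created_and_modified(
    server_modified=datetime(2024, 2, 1, tzinfo=timezone.utc),
    client_modified=datetime(2024, 1, 1, tzinfo=timezone.utc),
))
```
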
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -119,7 +119,7 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise self.wrap_error(e=e)
 
-    def
+    def get_file_info(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
@@ -156,24 +156,56 @@ class FsspecIndexer(Indexer):
 
         return random.sample(files, n)
 
-    def get_metadata(self,
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()
 
-    def get_path(self,
-        return
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]
 
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data)
 
+    def create_init_file_data(self, remote_filepath: Optional[str] = None) -> FileData:
+        # Create initial file data that requires no network calls and is constructed purely
+        # with information that exists in the config
+        remote_filepath = remote_filepath or self.index_config.remote_url
+        path_without_protocol = remote_filepath.split("://")[1]
+        rel_path = remote_filepath.replace(path_without_protocol, "").lstrip("/")
+        return FileData(
+            identifier=str(uuid5(NAMESPACE_DNS, remote_filepath)),
+            connector_type=self.connector_type,
+            display_name=remote_filepath,
+            source_identifiers=SourceIdentifiers(
+                filename=Path(remote_filepath).name,
+                rel_path=rel_path or None,
+                fullpath=remote_filepath,
+            ),
+            metadata=FileDataSourceMetadata(url=remote_filepath),
+        )
+
+    def hydrate_file_data(self, init_file_data: FileData):
+        # Get file info
+        with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+            files = client.ls(self.index_config.path_without_protocol, detail=True)
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        if not filtered_files:
+            raise ValueError(f"{init_file_data} did not reference any valid file")
+        if len(filtered_files) > 1:
+            raise ValueError(f"{init_file_data} referenced more than one file")
+        file_info = filtered_files[0]
+        init_file_data.additional_metadata = self.get_metadata(file_info=file_info)
+
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.
-        for
-            file_path = self.get_path(
+        files = self.get_file_info()
+        for file_info in files:
+            file_path = self.get_path(file_info=file_info)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
 
-            additional_metadata = self.sterilize_info(file_data=
+            additional_metadata = self.sterilize_info(file_data=file_info)
             additional_metadata["original_file_path"] = file_path
             yield FileData(
                 identifier=str(uuid5(NAMESPACE_DNS, file_path)),
@@ -183,7 +215,7 @@ class FsspecIndexer(Indexer):
                     rel_path=rel_path or None,
                     fullpath=file_path,
                 ),
-                metadata=self.get_metadata(
+                metadata=self.get_metadata(file_info=file_info),
                 additional_metadata=additional_metadata,
                 display_name=file_path,
             )
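
The new `create_init_file_data`/`hydrate_file_data` pair splits record construction into an offline step (a deterministic `uuid5` identifier plus path fields derived from the configured URL) and a later network step that pulls listing details for exactly one file. A simplified sketch of the same two-phase idea using `fsspec` directly; the `InitFileData` dataclass is a stand-in for the library's `FileData`, not its real definition:

```python
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from uuid import NAMESPACE_DNS, uuid5

import fsspec


@dataclass
class InitFileData:
    identifier: str
    fullpath: str
    filename: str
    additional_metadata: dict[str, Any] = field(default_factory=dict)


def create_init_file_data(remote_url: str) -> InitFileData:
    # Offline step: everything is derived from the configured URL, no network calls.
    return InitFileData(
        identifier=str(uuid5(NAMESPACE_DNS, remote_url)),
        fullpath=remote_url,
        filename=Path(remote_url).name,
    )


def hydrate_file_data(file_data: InitFileData, protocol: str = "s3") -> InitFileData:
    # Online step: one listing call fills in provider metadata for the single referenced file.
    fs = fsspec.filesystem(protocol)
    entries = fs.ls(file_data.fullpath, detail=True)
    files = [e for e in entries if e.get("type") == "file" and e.get("size", 0) > 0]
    if not files:
        raise ValueError(f"{file_data.fullpath} did not reference any valid file")
    if len(files) > 1:
        raise ValueError(f"{file_data.fullpath} referenced more than one file")
    file_data.additional_metadata = files[0]
    return file_data
```

Because the identifier is a `uuid5` of the remote path, the cheap and the hydrated record share the same identity, which is what lets the hydration step fill in metadata later without reindexing.
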
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -131,22 +131,22 @@ class GcsIndexer(FsspecIndexer):
     index_config: GcsIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str :=
+        if modified_at_str := file_info.get("updated"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str :=
+        if created_at_str := file_info.get("timeCreated"):
             date_created = str(parser.parse(created_at_str).timestamp())
 
-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None
 
-        version =
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -110,22 +110,22 @@ class S3Indexer(FsspecIndexer):
     def wrap_error(self, e: Exception) -> Exception:
         return self.connection_config.wrap_error(e=e)
 
-    def get_path(self,
-        return
+    def get_path(self, file_info: dict) -> str:
+        return file_info["Key"]
 
-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["Key"]
         date_created = None
         date_modified = None
-        modified =
+        modified = file_info.get("LastModified")
         if modified:
             date_created = str(modified.timestamp())
             date_modified = str(modified.timestamp())
 
-        file_size =
-        file_size = file_size or
+        file_size = file_info.get("size") if "size" in file_info else None
+        file_size = file_size or file_info.get("Size")
 
-        version =
+        version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
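
For S3, the listing reports the object size under `Size` and the version as a quoted `ETag`, so the indexer falls back from a lowercase `size` key and strips the quotes (the diff uses `rstrip('"').lstrip('"')`; `strip('"')` below is equivalent):

```python
file_info = {
    "Key": "bucket/data/report.pdf",
    "Size": 2048,
    "ETag": '"9b2cf535f27731c974343645a3985328"',
}

# Prefer the generic lowercase "size" if the listing provides it, otherwise fall back to S3's "Size".
file_size = file_info.get("size") or file_info.get("Size")
# The ETag comes back wrapped in literal double quotes; strip them to get a clean version string.
version = file_info["ETag"].strip('"') if "ETag" in file_info else None

print(file_size, version)  # 2048 9b2cf535f27731c974343645a3985328
```
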
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -107,12 +107,12 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file
 
-    def get_metadata(self,
-        path =
-        date_created = str(
-        date_modified = str(
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
+        date_created = str(file_info.get("time").timestamp()) if "time" in file_info else None
+        date_modified = str(file_info.get("mtime").timestamp()) if "mtime" in file_info else None
 
-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None
 
         record_locator = {
             "protocol": self.index_config.protocol,
unstructured_ingest/v2/processes/connectors/sql/__init__.py

@@ -15,11 +15,14 @@ from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
 from .snowflake import snowflake_destination_entry, snowflake_source_entry
 from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
 from .sqlite import sqlite_destination_entry, sqlite_source_entry
+from .vastdb import CONNECTOR_TYPE as VASTDB_CONNECTOR_TYPE
+from .vastdb import vastdb_destination_entry, vastdb_source_entry
 
 add_source_entry(source_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_source_entry)
 add_source_entry(source_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_source_entry)
 add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_entry)
 add_source_entry(source_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_source_entry)
+add_source_entry(source_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_source_entry)
 
 add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
 add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)
@@ -31,3 +34,4 @@ add_destination_entry(
     destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
     entry=databricks_delta_tables_destination_entry,
 )
+add_destination_entry(destination_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_destination_entry)
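
The new VastDB connector is wired in through the same `add_source_entry`/`add_destination_entry` calls as the other SQL connectors. A toy sketch of the registration pattern with hypothetical dictionary-backed registries and placeholder entries (the real registry lives in `unstructured_ingest.v2.processes.connector_registry`):

```python
from typing import Any

# Hypothetical dictionary-backed registries standing in for the real connector registry.
source_registry: dict[str, Any] = {}
destination_registry: dict[str, Any] = {}


def add_source_entry(source_type: str, entry: Any) -> None:
    source_registry[source_type] = entry


def add_destination_entry(destination_type: str, entry: Any) -> None:
    destination_registry[destination_type] = entry


# Each connector module exports a CONNECTOR_TYPE constant plus its registry entries,
# and the package __init__ registers them at import time so they can be looked up by name.
VASTDB_CONNECTOR_TYPE = "vastdb"
add_source_entry(source_type=VASTDB_CONNECTOR_TYPE, entry="<vastdb source entry>")
add_destination_entry(destination_type=VASTDB_CONNECTOR_TYPE, entry="<vastdb destination entry>")

print(sorted(source_registry), sorted(destination_registry))  # ['vastdb'] ['vastdb']
```
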
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -31,7 +31,7 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "databricks_delta_tables"
 
 
-class
+class DatabricksDeltaTablesAccessConfig(SQLAccessConfig):
     token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
@@ -39,8 +39,8 @@ class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
     )
 
 
-class
-    access_config: Secret[
+class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabricksDeltaTablesAccessConfig]
     server_hostname: str = Field(description="server hostname connection config value")
     http_path: str = Field(description="http path connection config value")
     user_agent: str = "unstructuredio_oss"
@@ -102,24 +102,24 @@ class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
             yield cursor
 
 
-class
+class DatabricksDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
     pass
 
 
-class
-    upload_stager_config:
+class DatabricksDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabricksDeltaTablesUploadStagerConfig
 
 
-class
+class DatabricksDeltaTablesUploaderConfig(SQLUploaderConfig):
     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
     database: str = Field(description="Database name", default="default")
     table_name: str = Field(description="Table name")
 
 
 @dataclass
-class
-    upload_config:
-    connection_config:
+class DatabricksDeltaTablesUploader(SQLUploader):
+    upload_config: DatabricksDeltaTablesUploaderConfig
+    connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
     @contextmanager
@@ -205,9 +205,9 @@ class DatabrickDeltaTablesUploader(SQLUploader):
 
 
 databricks_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=
-    uploader=
-    uploader_config=
-    upload_stager=
-    upload_stager_config=
+    connection_config=DatabricksDeltaTablesConnectionConfig,
+    uploader=DatabricksDeltaTablesUploader,
+    uploader_config=DatabricksDeltaTablesUploaderConfig,
+    upload_stager=DatabricksDeltaTablesUploadStager,
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )
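
The renamed config classes follow the usual pattern of wrapping credentials in a Pydantic `Secret` so they are masked in reprs and logs. The self-contained sketch below mirrors the field names from the diff but is illustrative only, not the library's classes, and the connection values are made up (the generic `Secret` type needs a recent Pydantic v2):

```python
from typing import Optional

from pydantic import BaseModel, Field, Secret


class AccessConfig(BaseModel):
    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
    client_secret: Optional[str] = Field(default=None)


class ConnectionConfig(BaseModel):
    access_config: Secret[AccessConfig]
    server_hostname: str = Field(description="server hostname connection config value")
    http_path: str = Field(description="http path connection config value")


config = ConnectionConfig(
    access_config=AccessConfig(token="dapi-example-token"),
    server_hostname="example.cloud.databricks.com",
    http_path="/sql/1.0/warehouses/example",
)
print(config)                                   # access_config renders as Secret('**********')
print(config.access_config.get_secret_value())  # unwrap only where the credentials are needed
```
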
unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -3,6 +3,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
+import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.v2.logger import logger
@@ -139,7 +140,7 @@ class SingleStoreUploader(SQLUploader):
                 if isinstance(value, (list, dict)):
                     value = json.dumps(value)
                 if column_name in _DATE_COLUMNS:
-                    if value is None:
+                    if value is None or pd.isna(value):
                         parsed.append(None)
                     else:
                         parsed.append(parse_date_string(value))
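
The date-handling change in SingleStore, SQLite, and the base SQL uploader is the same: values coming out of a pandas DataFrame are often `NaT` or `nan` rather than `None`, so the guard now uses `pd.isna` as well. A short illustration, with `dateutil`'s parser standing in for the library's `parse_date_string` helper:

```python
import pandas as pd
from dateutil.parser import parse as parse_date_string


def parse_maybe_date(value):
    # None and pandas missing values (NaT, nan) all map to None instead of raising in the parser.
    if value is None or pd.isna(value):
        return None
    return parse_date_string(value)


print(parse_maybe_date(None))          # None
print(parse_maybe_date(pd.NaT))        # None
print(parse_maybe_date(float("nan")))  # None
print(parse_maybe_date("2024-05-01"))  # 2024-05-01 00:00:00
```
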
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -14,7 +14,7 @@ from dateutil import parser
 from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
-from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe
+from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe, write_data
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -292,6 +292,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df
 
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
+        write_data(path=output_path, data=data)
+
     def run(
         self,
         elements_filepath: Path,
@@ -332,6 +335,7 @@ class SQLUploader(Uploader):
     upload_config: SQLUploaderConfig
     connection_config: SQLConnectionConfig
     values_delimiter: str = "?"
+    _columns: list[str] = field(init=False, default=None)
 
     def precheck(self) -> None:
         try:
@@ -354,7 +358,7 @@ class SQLUploader(Uploader):
         parsed = []
         for column_name, value in zip(columns, row):
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):  # pandas is nan
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))
@@ -364,8 +368,9 @@ class SQLUploader(Uploader):
         return output
 
     def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+        table_columns = self.get_table_columns()
         columns = set(df.columns)
-        schema_fields = set(
+        schema_fields = set(table_columns)
         columns_to_drop = columns - schema_fields
         missing_columns = schema_fields - columns
 
@@ -395,8 +400,8 @@ class SQLUploader(Uploader):
                 f"record id column "
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
+        df = self._fit_to_schema(df=df)
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df)
 
         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
@@ -424,9 +429,11 @@ class SQLUploader(Uploader):
             cursor.executemany(stmt, values)
 
     def get_table_columns(self) -> list[str]:
-
-
-
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = [desc[0] for desc in cursor.description]
+        return self._columns
 
     def can_delete(self) -> bool:
         return self.upload_config.record_id_key in self.get_table_columns()
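
`get_table_columns` now caches the column list on the instance, deriving it from `cursor.description` after a single `LIMIT 1` probe, and the result of `_fit_to_schema` is actually assigned back to the DataFrame. A self-contained sketch of both ideas against an in-memory SQLite table (illustrative names, not the library's API):

```python
import sqlite3

import pandas as pd


class ElementsTable:
    def __init__(self, connection: sqlite3.Connection, table_name: str):
        self.connection = connection
        self.table_name = table_name
        self._columns: list[str] | None = None

    def get_table_columns(self) -> list[str]:
        # Probe the table once and reuse cursor.description on every later call.
        if self._columns is None:
            cursor = self.connection.execute(f"SELECT * FROM {self.table_name} LIMIT 1")
            self._columns = [desc[0] for desc in cursor.description]
        return self._columns

    def fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        # Drop DataFrame columns the table does not know about and add missing ones as NULLs.
        schema_fields = set(self.get_table_columns())
        df = df.drop(columns=list(set(df.columns) - schema_fields))
        for missing in schema_fields - set(df.columns):
            df[missing] = None
        return df


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE elements (id TEXT, text TEXT, record_id TEXT)")
table = ElementsTable(conn, "elements")
df = table.fit_to_schema(pd.DataFrame([{"id": "1", "text": "hello", "extra": "dropped"}]))
print(sorted(df.columns))  # ['id', 'record_id', 'text']
```
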
unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator
 
+import pandas as pd
 from pydantic import Field, Secret, model_validator
 
 from unstructured_ingest.v2.logger import logger
@@ -141,7 +142,7 @@ class SQLiteUploader(SQLUploader):
                 if isinstance(value, (list, dict)):
                     value = json.dumps(value)
                 if column_name in _DATE_COLUMNS:
-                    if value is None:
+                    if value is None or pd.isna(value):
                         parsed.append(None)
                     else:
                         parsed.append(parse_date_string(value))