unstructured-ingest 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/utils/validation/equality.py +2 -1
- test/unit/v2/connectors/sql/__init__.py +0 -0
- test/unit/v2/connectors/sql/test_sql.py +74 -0
- test/unit/v2/connectors/test_confluence.py +6 -6
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/data_prep.py +11 -3
- unstructured_ingest/utils/html.py +109 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
- unstructured_ingest/v2/processes/connectors/confluence.py +125 -35
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/METADATA +18 -15
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/RECORD +34 -29
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -1,3 +1,4 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Generator, List, Optional

@@ -17,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (

@@ -30,38 +32,60 @@ CONNECTOR_TYPE = "confluence"


 class ConfluenceAccessConfig(AccessConfig):
-
-
-
+    password: Optional[str] = Field(
+        description="Confluence password or Cloud API token",
+        default=None,
+    )
+    token: Optional[str] = Field(
+        description="Confluence Personal Access Token",
+        default=None,
     )


 class ConfluenceConnectionConfig(ConnectionConfig):
     url: str = Field(description="URL of the Confluence instance")
-
+    username: Optional[str] = Field(
+        description="Username or email for authentication",
+        default=None,
+    )
+    cloud: bool = Field(description="Authenticate to Confluence Cloud", default=False)
     access_config: Secret[ConfluenceAccessConfig] = Field(
         description="Access configuration for Confluence"
     )

     def model_post_init(self, __context):
         access_configs = self.access_config.get_secret_value()
-        basic_auth = self.
-        pat_auth = access_configs.
+        basic_auth = self.username and access_configs.password
+        pat_auth = access_configs.token
+        if self.cloud and not basic_auth:
+            raise ValueError(
+                "cloud authentication requires username and API token (--password), "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
         if basic_auth and pat_auth:
-            raise ValueError(
+            raise ValueError(
+                "both password and token provided, only one allowed, "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
         if not (basic_auth or pat_auth):
-            raise ValueError(
+            raise ValueError(
+                "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
+            )

     @requires_dependencies(["atlassian"], extras="confluence")
+    @contextmanager
     def get_client(self) -> "Confluence":
         from atlassian import Confluence

         access_configs = self.access_config.get_secret_value()
-
+        with Confluence(
             url=self.url,
-            username=self.
-            password=access_configs.
-
+            username=self.username,
+            password=access_configs.password,
+            token=access_configs.token,
+            cloud=self.cloud,
+        ) as client:
+            yield client


 class ConfluenceIndexerConfig(IndexerConfig):

@@ -83,8 +107,8 @@ class ConfluenceIndexer(Indexer):

             # Attempt to retrieve a list of spaces with limit=1.
             # This should only succeed if all creds are valid
-
-
+            with self.connection_config.get_client() as client:
+                client.get_all_spaces(limit=1)
             logger.info("Connection to Confluence successful.")
             return True
         except Exception as e:

@@ -96,21 +120,21 @@ class ConfluenceIndexer(Indexer):
         if spaces:
             return spaces
         else:
-
-
+            with self.connection_config.get_client() as client:
+                all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
             space_ids = [space["key"] for space in all_spaces["results"]]
             return space_ids

     def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
-
-
-
-
-
-
-
-
-
+        with self.connection_config.get_client() as client:
+            pages = client.get_all_pages_from_space(
+                space=space_id,
+                start=0,
+                limit=self.index_config.max_num_of_docs_from_each_space,
+                expand=None,
+                content_type="page",
+                status=None,
+            )
         doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
         return doc_ids

@@ -157,7 +181,18 @@ class ConfluenceIndexer(Indexer):


 class ConfluenceDownloaderConfig(DownloaderConfig):
-
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )


 @dataclass

@@ -166,14 +201,37 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE

-    def
+    def download_embedded_files(
+        self, session, html: str, current_file_data: FileData
+    ) -> list[DownloadResponse]:
+        if not self.download_config.extract_files:
+            return []
+        from unstructured_ingest.utils.html import download_embedded_files
+
+        filepath = current_file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_dir = download_path.with_suffix("")
+        return download_embedded_files(
+            download_dir=download_dir,
+            original_filedata=current_file_data,
+            original_html=html,
+            session=session,
+            force_download=self.download_config.force_download,
+        )
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        from bs4 import BeautifulSoup
+
+        from unstructured_ingest.utils.html import convert_image_tags
+
         doc_id = file_data.identifier
         try:
-
-
-
-
-
+            with self.connection_config.get_client() as client:
+                page = client.get_page_by_id(
+                    page_id=doc_id,
+                    expand="history.lastUpdated,version,body.view",
+                )
+
         except Exception as e:
             logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")

@@ -182,20 +240,52 @@ class ConfluenceDownloader(Downloader):
             raise ValueError(f"Page with ID {doc_id} does not exist.")

         content = page["body"]["view"]["value"]
+        # This supports v2 html parsing in unstructured
+        title = page["title"]
+        title_html = f"<title>{title}</title>"
+        content = f"<body class='Document' >{title_html}{content}</body>"
+        if self.download_config.extract_images:
+            with self.connection_config.get_client() as client:
+                content = convert_image_tags(
+                    url=file_data.metadata.url, original_html=content, session=client._session
+                )

         filepath = file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_path.parent.mkdir(parents=True, exist_ok=True)
         with open(download_path, "w", encoding="utf8") as f:
-
+            soup = BeautifulSoup(content, "html.parser")
+            f.write(soup.prettify())

         # Update file_data with metadata
         file_data.metadata.date_created = page["history"]["createdDate"]
         file_data.metadata.date_modified = page["version"]["when"]
         file_data.metadata.version = str(page["version"]["number"])
-        file_data.display_name =
+        file_data.display_name = title

-
+        download_response = self.generate_download_response(
+            file_data=file_data, download_path=download_path
+        )
+        if self.download_config.extract_files:
+            with self.connection_config.get_client() as client:
+                extracted_download_responses = self.download_embedded_files(
+                    html=content,
+                    current_file_data=download_response["file_data"],
+                    session=client._session,
+                )
+            if extracted_download_responses:
+                for dr in extracted_download_responses:
+                    fd = dr["file_data"]
+                    source_file_path = Path(file_data.source_identifiers.fullpath).with_suffix(
+                        ""
+                    )
+                    new_fullpath = source_file_path / fd.source_identifiers.filename
+                    fd.source_identifiers = SourceIdentifiers(
+                        fullpath=new_fullpath.as_posix(), filename=new_fullpath.name
+                    )
+                extracted_download_responses.append(download_response)
+                return extracted_download_responses
+        return download_response


 confluence_source_entry = SourceRegistryEntry(
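The most consequential change in the Confluence connector above is that `get_client` is now wrapped in `@contextmanager`, so the `atlassian` client is opened and closed around each operation instead of being returned directly. Below is a minimal sketch of the new call pattern; the URL, username, and token values are placeholders for illustration, not from the release:

```python
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
)

# Placeholder values for illustration only.
config = ConfluenceConnectionConfig(
    url="https://example.atlassian.net",
    username="user@example.com",
    cloud=True,
    access_config=ConfluenceAccessConfig(password="my-cloud-api-token"),
)

# get_client() is now a context manager: the Confluence client is created on
# entry and torn down on exit.
with config.get_client() as client:
    spaces = client.get_all_spaces(limit=1)
```

This mirrors the `with self.connection_config.get_client() as client:` blocks introduced throughout the indexer and downloader in the hunks above.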
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -4,7 +4,7 @@ from typing import Any

 import pandas as pd

-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id

@@ -96,5 +96,5 @@ class BaseDuckDBUploadStager(UploadStager):
             df[column] = df[column].apply(str)

         data = df.to_dict(orient="records")
-
+        write_data(path=output_path, data=data)
         return output_path
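Both the DuckDB stager above and the SQL stager further down now route their output through `write_data` from `unstructured_ingest.utils.data_prep` instead of serializing JSON inline, which lines up with the new `utils/ndjson.py` module added in this release. The helper below is a hypothetical sketch of suffix-dispatched writing to illustrate the idea; it is not the released implementation, and only the call sites shown in the diff (`write_data(path=..., data=...)`) are confirmed.

```python
import json
from pathlib import Path
from typing import Any


def write_records(path: Path, data: list[dict[str, Any]]) -> None:
    """Hypothetical sketch: choose the serialization format from the file suffix."""
    with path.open("w", encoding="utf8") as f:
        if path.suffix == ".ndjson":
            # newline-delimited JSON: one record per line, easy to stream
            for record in data:
                f.write(json.dumps(record) + "\n")
        else:
            # a single JSON array
            json.dump(data, f, indent=2)
```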
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -128,22 +128,22 @@ class AzureIndexer(FsspecIndexer):
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = (
-            str(
-            if "creation_time" in
+            str(file_info.get("creation_time").timestamp())
+            if "creation_time" in file_info
             else None
         )
         date_modified = (
-            str(
-            if "last_modified" in
+            str(file_info.get("last_modified").timestamp())
+            if "last_modified" in file_info
             else None
         )

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

-        version =
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -104,22 +104,22 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str :=
+        if modified_at_str := file_info.get("modified_at"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str :=
+        if created_at_str := file_info.get("created_at"):
             date_created = str(parser.parse(created_at_str).timestamp())

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

-        version =
+        version = file_info.get("id")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -93,15 +93,15 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_path(self,
-        return
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"].lstrip("/")
         date_created = None
         date_modified = None
-        server_modified =
-        client_modified =
+        server_modified = file_info.get("server_modified")
+        client_modified = file_info.get("client_modified")
         if server_modified and client_modified and server_modified > client_modified:
             date_created = str(client_modified.timestamp())
             date_modified = str(server_modified.timestamp())

@@ -109,13 +109,13 @@ class DropboxIndexer(FsspecIndexer):
             date_created = str(server_modified.timestamp())
             date_modified = str(client_modified.timestamp())

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

-        version =
+        version = file_info.get("content_hash")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -119,7 +119,7 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise self.wrap_error(e=e)

-    def
+    def get_file_info(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems

@@ -156,24 +156,56 @@ class FsspecIndexer(Indexer):

         return random.sample(files, n)

-    def get_metadata(self,
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()

-    def get_path(self,
-        return
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]

     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data)

+    def create_init_file_data(self, remote_filepath: Optional[str] = None) -> FileData:
+        # Create initial file data that requires no network calls and is constructed purely
+        # with information that exists in the config
+        remote_filepath = remote_filepath or self.index_config.remote_url
+        path_without_protocol = remote_filepath.split("://")[1]
+        rel_path = remote_filepath.replace(path_without_protocol, "").lstrip("/")
+        return FileData(
+            identifier=str(uuid5(NAMESPACE_DNS, remote_filepath)),
+            connector_type=self.connector_type,
+            display_name=remote_filepath,
+            source_identifiers=SourceIdentifiers(
+                filename=Path(remote_filepath).name,
+                rel_path=rel_path or None,
+                fullpath=remote_filepath,
+            ),
+            metadata=FileDataSourceMetadata(url=remote_filepath),
+        )
+
+    def hydrate_file_data(self, init_file_data: FileData):
+        # Get file info
+        with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+            files = client.ls(self.index_config.path_without_protocol, detail=True)
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        if not filtered_files:
+            raise ValueError(f"{init_file_data} did not reference any valid file")
+        if len(filtered_files) > 1:
+            raise ValueError(f"{init_file_data} referenced more than one file")
+        file_info = filtered_files[0]
+        init_file_data.additional_metadata = self.get_metadata(file_info=file_info)
+
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.
-        for
-            file_path = self.get_path(
+        files = self.get_file_info()
+        for file_info in files:
+            file_path = self.get_path(file_info=file_info)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")

-            additional_metadata = self.sterilize_info(file_data=
+            additional_metadata = self.sterilize_info(file_data=file_info)
             additional_metadata["original_file_path"] = file_path
             yield FileData(
                 identifier=str(uuid5(NAMESPACE_DNS, file_path)),

@@ -183,7 +215,7 @@ class FsspecIndexer(Indexer):
                     rel_path=rel_path or None,
                     fullpath=file_path,
                 ),
-                metadata=self.get_metadata(
+                metadata=self.get_metadata(file_info=file_info),
                 additional_metadata=additional_metadata,
                 display_name=file_path,
             )
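`FsspecIndexer` now splits record creation into two phases: `create_init_file_data` builds a `FileData` skeleton purely from configuration (no network calls), and `hydrate_file_data` performs a single listing to attach real metadata. A small sketch of how the two compose; the wrapper function and the S3 path are illustrative, not part of the package:

```python
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import FsspecIndexer


def index_single_file(indexer: FsspecIndexer, remote_filepath: str) -> FileData:
    # Phase 1: cheap, offline -- identifier, source identifiers, and URL come from config.
    file_data = indexer.create_init_file_data(remote_filepath=remote_filepath)
    # Phase 2: one client.ls() call fills in additional_metadata for the same record.
    indexer.hydrate_file_data(init_file_data=file_data)
    return file_data


# e.g. index_single_file(s3_indexer, "s3://my-bucket/docs/report.pdf")
```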
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -131,22 +131,22 @@ class GcsIndexer(FsspecIndexer):
     index_config: GcsIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str :=
+        if modified_at_str := file_info.get("updated"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str :=
+        if created_at_str := file_info.get("timeCreated"):
             date_created = str(parser.parse(created_at_str).timestamp())

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

-        version =
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -110,22 +110,22 @@ class S3Indexer(FsspecIndexer):
     def wrap_error(self, e: Exception) -> Exception:
         return self.connection_config.wrap_error(e=e)

-    def get_path(self,
-        return
+    def get_path(self, file_info: dict) -> str:
+        return file_info["Key"]

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["Key"]
         date_created = None
         date_modified = None
-        modified =
+        modified = file_info.get("LastModified")
         if modified:
             date_created = str(modified.timestamp())
             date_modified = str(modified.timestamp())

-        file_size =
-        file_size = file_size or
+        file_size = file_info.get("size") if "size" in file_info else None
+        file_size = file_size or file_info.get("Size")

-        version =
+        version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -107,12 +107,12 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file

-    def get_metadata(self,
-        path =
-        date_created = str(
-        date_modified = str(
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
+        date_created = str(file_info.get("time").timestamp()) if "time" in file_info else None
+        date_modified = str(file_info.get("mtime").timestamp()) if "mtime" in file_info else None

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

         record_locator = {
             "protocol": self.index_config.protocol,
unstructured_ingest/v2/processes/connectors/sql/__init__.py

@@ -15,11 +15,14 @@ from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
 from .snowflake import snowflake_destination_entry, snowflake_source_entry
 from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
 from .sqlite import sqlite_destination_entry, sqlite_source_entry
+from .vastdb import CONNECTOR_TYPE as VASTDB_CONNECTOR_TYPE
+from .vastdb import vastdb_destination_entry, vastdb_source_entry

 add_source_entry(source_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_source_entry)
 add_source_entry(source_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_source_entry)
 add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_entry)
 add_source_entry(source_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_source_entry)
+add_source_entry(source_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_source_entry)

 add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
 add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)

@@ -31,3 +34,4 @@ add_destination_entry(
     destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
     entry=databricks_delta_tables_destination_entry,
 )
+add_destination_entry(destination_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_destination_entry)
unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -3,6 +3,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

+import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.v2.logger import logger

@@ -139,7 +140,7 @@ class SingleStoreUploader(SQLUploader):
             if isinstance(value, (list, dict)):
                 value = json.dumps(value)
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))
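The guard in `SingleStoreUploader` (and in the base `SQLUploader` below) changes from `if value is None` to `if value is None or pd.isna(value)` because rows pulled out of a pandas DataFrame represent missing dates as `NaN`/`NaT`, which are not `None` and would otherwise be handed to the date parser. A short worked example of the distinction:

```python
import numpy as np
import pandas as pd

value = np.nan           # how pandas encodes a missing value in an object column

print(value is None)     # False -- the old check would let NaN through to the parser
print(pd.isna(value))    # True  -- the new check treats it as missing
print(pd.isna(None))     # True  -- plain None is still covered
```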
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -170,7 +170,7 @@ class SnowflakeUploader(SQLUploader):
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df
+        self._fit_to_schema(df=df)

         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -14,7 +14,7 @@ from dateutil import parser
 from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
-from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe
+from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe, write_data
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -310,9 +310,11 @@ class SQLUploadStager(UploadStager):
         )
         df = self.conform_dataframe(df=df)

+        output_filename_suffix = Path(elements_filepath).suffix
+        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

-
+        write_data(path=output_path, data=df.to_dict(orient="records"))
         return output_path


@@ -330,6 +332,7 @@ class SQLUploader(Uploader):
     upload_config: SQLUploaderConfig
     connection_config: SQLConnectionConfig
     values_delimiter: str = "?"
+    _columns: list[str] = field(init=False, default=None)

     def precheck(self) -> None:
         try:

@@ -352,7 +355,7 @@ class SQLUploader(Uploader):
         parsed = []
         for column_name, value in zip(columns, row):
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):  # pandas is nan
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))

@@ -362,8 +365,9 @@ class SQLUploader(Uploader):
         return output

     def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+        table_columns = self.get_table_columns()
         columns = set(df.columns)
-        schema_fields = set(
+        schema_fields = set(table_columns)
         columns_to_drop = columns - schema_fields
         missing_columns = schema_fields - columns

@@ -393,8 +397,8 @@ class SQLUploader(Uploader):
                 f"record id column "
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
+        df = self._fit_to_schema(df=df)
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df)

         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(

@@ -422,9 +426,11 @@ class SQLUploader(Uploader):
             cursor.executemany(stmt, values)

     def get_table_columns(self) -> list[str]:
-
-
-
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = [desc[0] for desc in cursor.description]
+        return self._columns

     def can_delete(self) -> bool:
         return self.upload_config.record_id_key in self.get_table_columns()
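`get_table_columns` now lazily caches the destination table's column names in the new `_columns` field, so repeated calls from `_fit_to_schema` and `can_delete` no longer issue a `SELECT ... LIMIT 1` per batch. Below is a standalone sketch of the same caching pattern, with an in-memory SQLite table standing in for the real destination; the class and table names are illustrative only:

```python
import sqlite3
from dataclasses import dataclass, field


@dataclass
class ColumnCache:
    """Minimal sketch of the lazy column-name caching used by SQLUploader."""

    table_name: str
    _columns: list = field(init=False, default=None)

    def get_table_columns(self, cursor) -> list:
        # Query the database only on the first call; reuse the cached list afterwards.
        if self._columns is None:
            cursor.execute(f"SELECT * FROM {self.table_name} LIMIT 1")
            self._columns = [desc[0] for desc in cursor.description]
        return self._columns


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE elements (id TEXT, text TEXT, record_id TEXT)")
cache = ColumnCache(table_name="elements")
print(cache.get_table_columns(conn.cursor()))  # ['id', 'text', 'record_id']
```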