unstructured-ingest 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_vastdb.py +34 -0
- test/integration/connectors/test_google_drive.py +257 -0
- test/unit/v2/connectors/motherduck/__init__.py +0 -0
- test/unit/v2/connectors/motherduck/test_base.py +74 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +13 -6
- unstructured_ingest/embed/huggingface.py +11 -4
- unstructured_ingest/embed/interfaces.py +2 -21
- unstructured_ingest/embed/mixedbreadai.py +13 -4
- unstructured_ingest/embed/octoai.py +13 -6
- unstructured_ingest/embed/openai.py +13 -6
- unstructured_ingest/embed/togetherai.py +13 -4
- unstructured_ingest/embed/vertexai.py +13 -6
- unstructured_ingest/embed/voyageai.py +13 -4
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -0
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +144 -13
- unstructured_ingest/v2/processes/connectors/pinecone.py +1 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +53 -3
- unstructured_ingest/v2/processes/connectors/sql/sql.py +3 -47
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +4 -12
- {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/METADATA +18 -18
- {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/RECORD +27 -23
- {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/top_level.txt +0 -0

unstructured_ingest/embed/vertexai.py
@@ -9,6 +9,7 @@ from pydantic import Field, Secret, ValidationError
 from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -75,9 +76,12 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return self._embed_documents(elements=[query])[0]
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     @requires_dependencies(
         ["vertexai"],
@@ -110,9 +114,12 @@ class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return embedding[0]
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     @requires_dependencies(
         ["vertexai"],

unstructured_ingest/embed/voyageai.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Optional
 from pydantic import Field, SecretStr
 
 from unstructured_ingest.embed.interfaces import (
+    EMBEDDINGS_KEY,
     AsyncBaseEmbeddingEncoder,
     BaseEmbeddingEncoder,
     EmbeddingConfig,
@@ -107,8 +108,12 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         return embeddings
 
     def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
@@ -135,8 +140,12 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
         return embeddings
 
     async def embed_documents(self, elements: list[dict]) -> list[dict]:
-
-
+        elements = elements.copy()
+        elements_with_text = [e for e in elements if e.get("text")]
+        embeddings = await self._embed_documents([e["text"] for e in elements_with_text])
+        for element, embedding in zip(elements_with_text, embeddings):
+            element[EMBEDDINGS_KEY] = embedding
+        return elements
 
     async def embed_query(self, query: str) -> list[float]:
         embedding = await self._embed_documents(elements=[query])
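
Both the sync and async encoders above now share the same guard: elements with empty or missing `text` are passed through untouched, and only the rest are sent to the embedding backend, with the returned vectors written back under `EMBEDDINGS_KEY`. A minimal standalone sketch of that pattern; the stub backend below is illustrative only, and the literal value of `EMBEDDINGS_KEY` is assumed from context:

    EMBEDDINGS_KEY = "embeddings"  # assumed value; the real constant lives in unstructured_ingest.embed.interfaces

    def _embed_documents(texts: list[str]) -> list[list[float]]:
        # Stand-in for the real provider call (VertexAI, VoyageAI, ...).
        return [[float(len(t))] for t in texts]

    def embed_documents(elements: list[dict]) -> list[dict]:
        elements = elements.copy()
        # Elements with no text (e.g. page breaks) keep their dicts unchanged.
        elements_with_text = [e for e in elements if e.get("text")]
        embeddings = _embed_documents([e["text"] for e in elements_with_text])
        for element, embedding in zip(elements_with_text, embeddings):
            element[EMBEDDINGS_KEY] = embedding
        return elements

    print(embed_documents([{"text": "hello"}, {"text": ""}]))
    # [{'text': 'hello', 'embeddings': [5.0]}, {'text': ''}]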

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -81,6 +81,8 @@ class BaseDuckDBUploadStager(UploadStager):
         **kwargs: Any,
     ) -> Path:
         elements_contents = get_data(path=elements_filepath)
+        output_filename_suffix = Path(elements_filepath).suffix
+        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
         output = [
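
The two added lines make the staged output keep the same suffix as the incoming elements file. A quick illustration of the `pathlib` manipulation involved (file names are hypothetical):

    from pathlib import Path

    elements_filepath = "work/structured/report.ndjson"  # hypothetical input file
    output_filename = "report-chunked"                    # hypothetical base name

    output_filename_suffix = Path(elements_filepath).suffix
    output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"

    print(output_filename)  # report-chunked.ndjson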

unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -61,7 +61,7 @@ class MotherDuckConnectionConfig(ConnectionConfig):
                 "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
             },
         ) as conn:
-            conn.sql(f
+            conn.sql(f'USE "{self.database}"')
             yield conn
 
     @contextmanager
@@ -102,11 +102,12 @@ class MotherDuckUploader(Uploader):
 
     def upload_dataframe(self, df: pd.DataFrame) -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
+        database = self.connection_config.database
+        db_schema = self.connection_config.db_schema
+        table = self.connection_config.table
 
         with self.connection_config.get_client() as conn:
-            conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
-            )
+            conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         df = pd.DataFrame(data=data)
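
Both MotherDuck changes move to fully qualified, double-quoted identifiers, so database, schema, or table names that are mixed-case or contain special characters no longer break the `USE` or `INSERT` statements. A small sketch of the string being built (the names are hypothetical):

    database = "my-md-database"  # hypothetical MotherDuck database
    db_schema = "Main"           # hypothetical schema
    table = "elements"           # hypothetical table

    stmt = f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df'
    print(stmt)
    # INSERT INTO "my-md-database"."Main"."elements" BY NAME SELECT * FROM df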

unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -132,12 +132,141 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
+    @staticmethod
+    def verify_drive_api_enabled(client) -> None:
+        from googleapiclient.errors import HttpError
+
+        """
+        Makes a lightweight API call to verify that the Drive API is enabled.
+        If the API is not enabled, an HttpError should be raised.
+        """
+        try:
+            # A very minimal call: list 1 file from the drive.
+            client.list(spaces="drive", pageSize=1, fields="files(id)").execute()
+        except HttpError as e:
+            error_content = e.content.decode() if hasattr(e, "content") else ""
+            lower_error = error_content.lower()
+            if "drive api" in lower_error and (
+                "not enabled" in lower_error or "not been used" in lower_error
+            ):
+                raise SourceConnectionError(
+                    "Google Drive API is not enabled for your project. \
+                    Please enable it in the Google Cloud Console."
+                )
+            else:
+                raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
+
+    @staticmethod
+    def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
+        """
+        Count non-folder files recursively under the given folder.
+        If `extensions` is provided, only count files
+        whose `fileExtension` matches one of the values.
+        """
+        count = 0
+        stack = [folder_id]
+        while stack:
+            current_folder = stack.pop()
+            # Always list all items under the current folder.
+            query = f"'{current_folder}' in parents"
+            page_token = None
+            while True:
+                response = files_client.list(
+                    spaces="drive",
+                    q=query,
+                    fields="nextPageToken, files(id, mimeType, fileExtension)",
+                    pageToken=page_token,
+                    pageSize=1000,
+                ).execute()
+                for item in response.get("files", []):
+                    if item.get("mimeType") == "application/vnd.google-apps.folder":
+                        # Always traverse sub-folders regardless of extension filter.
+                        stack.append(item["id"])
+                    else:
+                        if extensions:
+                            # Use a case-insensitive comparison for the file extension.
+                            file_ext = (item.get("fileExtension") or "").lower()
+                            valid_exts = [e.lower() for e in extensions]
+                            if file_ext in valid_exts:
+                                count += 1
+                        else:
+                            count += 1
+                page_token = response.get("nextPageToken")
+                if not page_token:
+                    break
+        return count
+
     def precheck(self) -> None:
+        """
+        Enhanced precheck that verifies not only connectivity
+        but also that the provided drive_id is valid and accessible.
+        """
         try:
-            self.connection_config.get_client()
+            with self.connection_config.get_client() as client:
+                # First, verify that the Drive API is enabled.
+                self.verify_drive_api_enabled(client)
+
+                # Try to retrieve metadata for the drive id.
+                # This will catch errors such as an invalid drive id or insufficient permissions.
+                root_info = self.get_root_info(
+                    files_client=client, object_id=self.connection_config.drive_id
+                )
+                logger.info(
+                    f"Successfully retrieved drive root info: "
+                    f"{root_info.get('name', 'Unnamed')} (ID: {root_info.get('id')})"
+                )
+
+                # If the target is a folder, perform file count check.
+                if self.is_dir(root_info):
+                    if self.index_config.recursive:
+                        file_count = self.count_files_recursively(
+                            client,
+                            self.connection_config.drive_id,
+                            extensions=self.index_config.extensions,
+                        )
+                        if file_count == 0:
+                            logger.warning(
+                                "Empty folder: no files found recursively in the folder. \
+                                Please verify that the folder contains files and \
+                                that the service account has proper permissions."
+                            )
+                            # raise SourceConnectionError(
+                            #     "Empty folder: no files found recursively in the folder. "
+                            #     "Please verify that the folder contains files and \
+                            #     that the service account has proper permissions."
+                            # )
+                        else:
+                            logger.info(f"Found {file_count} files recursively in the folder.")
+                    else:
+                        # Non-recursive: check for at least one immediate non-folder child.
+                        response = client.list(
+                            spaces="drive",
+                            fields="files(id)",
+                            pageSize=1,
+                            q=f"'{self.connection_config.drive_id}' in parents",
+                        ).execute()
+                        if not response.get("files"):
+                            logger.warning(
+                                "Empty folder: no files found at the folder's root level. "
+                                "Please verify that the folder contains files and \
+                                that the service account has proper permissions."
+                            )
+                            # raise SourceConnectionError(
+                            #     "Empty folder: no files found at the folder's root level. "
+                            #     "Please verify that the folder contains files and \
+                            #     that the service account has proper permissions."
+                            # )
+                        else:
+                            logger.info("Found files at the folder's root level.")
+                else:
+                    # If the target is a file, precheck passes.
+                    logger.info("Drive ID corresponds to a file. Precheck passed.")
+
         except Exception as e:
-            logger.error(
-
+            logger.error(
+                "Failed to validate Google Drive connection during precheck", exc_info=True
+            )
+            raise SourceConnectionError(f"Precheck failed: {e}")
 
     @staticmethod
     def is_dir(record: dict) -> bool:
@@ -310,20 +439,22 @@ class GoogleDriveDownloader(Downloader):
         from googleapiclient.http import MediaIoBaseDownload
 
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
-        mime_type = file_data.additional_metadata["mimeType"]
         record_id = file_data.identifier
+        mime_type = file_data.additional_metadata["mimeType"]
+        if not mime_type:
+            raise TypeError(
+                f"File not supported. Name: {file_data.source_identifiers.filename} "
+                f"ID: {record_id} "
+                f"MimeType: {mime_type}"
+            )
         with self.connection_config.get_client() as client:
-            if
+            if (
+                mime_type.startswith("application/vnd.google-apps")
+                and mime_type in GOOGLE_DRIVE_EXPORT_TYPES
+            ):
                 export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
-
+                    mime_type,  # type: ignore
                 )
-                if not export_mime:
-                    raise TypeError(
-                        f"File not supported. Name: {file_data.source_identifiers.filename} "
-                        f"ID: {record_id} "
-                        f"MimeType: {mime_type}"
-                    )
-
                 request = client.export_media(
                     fileId=record_id,
                     mimeType=export_mime,
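
The `client` used throughout the new indexer code behaves like a Drive v3 `files()` resource: it exposes `list(...)`, `export_media(...)` and friends. A hedged sketch of how such a resource is typically obtained with google-api-python-client and exercised the way the new precheck does; the credentials path and scope are placeholders, and the connector's own `get_client()` may construct it differently:

    from google.oauth2 import service_account
    from googleapiclient.discovery import build

    # Placeholder service-account key and read-only scope.
    creds = service_account.Credentials.from_service_account_file(
        "service-account.json",
        scopes=["https://www.googleapis.com/auth/drive.readonly"],
    )
    files_client = build("drive", "v3", credentials=creds).files()

    # The same style of lightweight call the precheck now issues.
    response = files_client.list(spaces="drive", pageSize=1, fields="files(id)").execute()
    print(response.get("files", []))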

unstructured_ingest/v2/processes/connectors/sql/snowflake.py
@@ -1,6 +1,7 @@
+import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import numpy as np
 import pandas as pd
@@ -15,6 +16,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    _DATE_COLUMNS,
     SQLAccessConfig,
     SqlBatchFileData,
     SQLConnectionConfig,
@@ -26,6 +28,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
     SQLUploaderConfig,
     SQLUploadStager,
     SQLUploadStagerConfig,
+    parse_date_string,
 )
 
 if TYPE_CHECKING:
@@ -34,6 +37,17 @@ if TYPE_CHECKING:
 
 CONNECTOR_TYPE = "snowflake"
 
+_ARRAY_COLUMNS = (
+    "embeddings",
+    "languages",
+    "link_urls",
+    "link_texts",
+    "sent_from",
+    "sent_to",
+    "emphasized_text_contents",
+    "emphasized_text_tags",
+)
+
 
 class SnowflakeAccessConfig(SQLAccessConfig):
     password: Optional[str] = Field(default=None, description="DB password")
@@ -160,6 +174,42 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"
 
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if column_name in _DATE_COLUMNS:
+                    if value is None or pd.isna(value):  # pandas is nan
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                elif column_name in _ARRAY_COLUMNS:
+                    if not isinstance(value, list) and (
+                        value is None or pd.isna(value)
+                    ):  # pandas is nan
+                        parsed.append(None)
+                    else:
+                        parsed.append(json.dumps(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
+
+    def _parse_values(self, columns: list[str]) -> str:
+        return ",".join(
+            [
+                (
+                    f"PARSE_JSON({self.values_delimiter})"
+                    if col in _ARRAY_COLUMNS
+                    else self.values_delimiter
+                )
+                for col in columns
+            ]
+        )
+
     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
@@ -173,10 +223,10 @@ class SnowflakeUploader(SQLUploader):
         self._fit_to_schema(df=df)
 
         columns = list(df.columns)
-        stmt = "INSERT INTO {table_name} ({columns})
+        stmt = "INSERT INTO {table_name} ({columns}) SELECT {values}".format(
             table_name=self.upload_config.table_name,
             columns=",".join(columns),
-            values=
+            values=self._parse_values(columns),
         )
         logger.info(
             f"writing a total of {len(df)} elements via"
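
Taken together, `prepare_data` JSON-encodes list-valued metadata columns and `_parse_values` swaps their placeholders for `PARSE_JSON(?)`, so Snowflake receives real arrays instead of stringified Python lists. A small sketch of the statement and bound values the uploader ends up producing (table and column names are hypothetical, and `_ARRAY_COLUMNS` is trimmed to a few entries):

    import json

    _ARRAY_COLUMNS = ("embeddings", "languages", "link_urls", "link_texts")
    values_delimiter = "?"

    def parse_values(columns: list[str]) -> str:
        return ",".join(
            f"PARSE_JSON({values_delimiter})" if col in _ARRAY_COLUMNS else values_delimiter
            for col in columns
        )

    columns = ["id", "text", "embeddings"]        # hypothetical dataframe columns
    row = ("abc-123", "hello world", [0.1, 0.2])  # hypothetical row

    stmt = "INSERT INTO {table_name} ({columns}) SELECT {values}".format(
        table_name="ELEMENTS",  # hypothetical table
        columns=",".join(columns),
        values=parse_values(columns),
    )
    bound = tuple(json.dumps(v) if c in _ARRAY_COLUMNS else v for c, v in zip(columns, row))

    print(stmt)   # INSERT INTO ELEMENTS (id,text,embeddings) SELECT ?,?,PARSE_JSON(?)
    print(bound)  # ('abc-123', 'hello world', '[0.1, 0.2]')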

unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -38,48 +38,6 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.utils import get_enhanced_element_id
 
-_COLUMNS = (
-    "id",
-    "element_id",
-    "text",
-    "embeddings",
-    "type",
-    "system",
-    "layout_width",
-    "layout_height",
-    "points",
-    "url",
-    "version",
-    "date_created",
-    "date_modified",
-    "date_processed",
-    "permissions_data",
-    "record_locator",
-    "category_depth",
-    "parent_id",
-    "attached_filename",
-    "filetype",
-    "last_modified",
-    "file_directory",
-    "filename",
-    "languages",
-    "page_number",
-    "links",
-    "page_name",
-    "link_urls",
-    "link_texts",
-    "sent_from",
-    "sent_to",
-    "subject",
-    "section",
-    "header_footer_type",
-    "emphasized_text_contents",
-    "emphasized_text_tags",
-    "text_as_html",
-    "regex_metadata",
-    "detection_class_prob",
-)
-
 _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
 
 
@@ -270,10 +228,8 @@ class SQLUploadStager(UploadStager):
 
         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
 
-
-
-        element[RECORD_ID_LABEL] = file_data.identifier
-        return element
+        data[RECORD_ID_LABEL] = file_data.identifier
+        return data
 
     def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
         for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
@@ -375,7 +331,7 @@ class SQLUploader(Uploader):
         missing_columns = schema_fields - columns
 
         if columns_to_drop:
-            logger.
+            logger.info(
                 "Following columns will be dropped to match the table's schema: "
                 f"{', '.join(columns_to_drop)}"
             )

unstructured_ingest/v2/processes/connectors/sql/vastdb.py
@@ -19,7 +19,6 @@ from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.sql.sql import (
-    _COLUMNS,
     SQLAccessConfig,
     SqlBatchFileData,
     SQLConnectionConfig,
@@ -149,13 +148,11 @@ class VastdbUploadStagerConfig(SQLUploadStagerConfig):
         default=None,
         description="Map of column names to rename, ex: {'old_name': 'new_name'}",
     )
-    additional_columns: Optional[list[str]] = Field(
-        default_factory=list, description="Additional columns to include in the upload"
-    )
 
 
+@dataclass
 class VastdbUploadStager(SQLUploadStager):
-    upload_stager_config: VastdbUploadStagerConfig
+    upload_stager_config: VastdbUploadStagerConfig = field(default_factory=VastdbUploadStagerConfig)
 
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
@@ -168,13 +165,8 @@ class VastdbUploadStager(SQLUploadStager):
         data.update(coordinates)
 
         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
-
-
-        # but also allow for additional columns
-        approved_columns = set(_COLUMNS).union(self.upload_stager_config.additional_columns)
-        element = {k: v for k, v in data.items() if k in approved_columns}
-        element[RECORD_ID_LABEL] = file_data.identifier
-        return element
+        data[RECORD_ID_LABEL] = file_data.identifier
+        return data
 
     def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
         df = super().conform_dataframe(df=df)
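
With `VastdbUploadStager` now a `@dataclass`, its config attribute takes `field(default_factory=...)` rather than a bare annotation or instance default: a factory gives every stager its own config object, while a plain mutable default would be shared or rejected outright depending on the type. A generic illustration of the pattern (the classes here are stand-ins, not the connector's real ones):

    from dataclasses import dataclass, field

    @dataclass
    class StagerConfig:  # stand-in for VastdbUploadStagerConfig
        rename_columns: dict = field(default_factory=dict)

    @dataclass
    class Stager:  # stand-in for VastdbUploadStager
        upload_stager_config: StagerConfig = field(default_factory=StagerConfig)

    # Each instance gets its own config object rather than sharing one default.
    a, b = Stager(), Stager()
    print(a.upload_stager_config is b.upload_stager_config)  # False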

{unstructured_ingest-0.5.0.dist-info → unstructured_ingest-0.5.2.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.5.0
+Version: 0.5.2
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,37 +23,37 @@ Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: pandas
-Requires-Dist: dataclasses-json
 Requires-Dist: pydantic>=2.7
-Requires-Dist:
-Requires-Dist: tqdm
+Requires-Dist: dataclasses-json
 Requires-Dist: python-dateutil
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: click
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
 Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: deltalake; extra == "delta-table"
 Requires-Dist: boto3; extra == "delta-table"
+Requires-Dist: deltalake; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
 Provides-Extra: doc
@@ -83,8 +83,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -92,9 +92,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -126,10 +126,10 @@ Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: neo4j; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist:
+Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist:
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
@@ -137,8 +137,8 @@ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -174,8 +174,8 @@ Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -192,11 +192,11 @@ Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vastdb
 Requires-Dist: vastdb; extra == "vastdb"
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"