unstructured-ingest 0.5.20__py3-none-any.whl → 0.5.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- test/integration/connectors/test_astradb.py +8 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/interfaces.py +7 -3
- unstructured_ingest/utils/data_prep.py +17 -5
- unstructured_ingest/utils/table.py +11 -4
- unstructured_ingest/v2/interfaces/__init__.py +8 -1
- unstructured_ingest/v2/interfaces/file_data.py +13 -116
- unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
- unstructured_ingest/v2/processes/connectors/sql/sql.py +22 -9
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest/v2/types/file_data.py +116 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/METADATA +168 -17
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/RECORD +29 -27
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.utils.data_prep import split_dataframe
@@ -32,6 +30,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 )

 if TYPE_CHECKING:
+    from pandas import DataFrame
     from snowflake.connector import SnowflakeConnection
     from snowflake.connector.cursor import SnowflakeCursor

@@ -174,9 +173,12 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"

+    @requires_dependencies(["pandas"], extras="snowflake")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
@@ -210,7 +212,9 @@ class SnowflakeUploader(SQLUploader):
             ]
         )

-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
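The recurring change in this release, visible in the snowflake.py hunks above and repeated in sql.py, sqlite.py, and vastdb.py below, is to defer pandas and numpy: the module-level imports are removed, type hints switch to a string-quoted "DataFrame" behind TYPE_CHECKING, and the heavy import happens inside the method that needs it, guarded by @requires_dependencies where an optional extra must be installed. A minimal standalone sketch of that pattern follows; the class and method bodies are illustrative and not taken from the package.

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Only evaluated by static type checkers, so pandas is not a hard runtime dependency.
    from pandas import DataFrame


class ExampleUploader:
    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        # Deferred import: pandas is loaded only when this method actually runs.
        import pandas as pd

        # columns is kept to mirror the connector signature; it is unused in this sketch.
        output = []
        for row in data:
            # pd.isna covers both None and NaN cells coming back from a database driver.
            output.append(tuple(None if pd.isna(value) else value for value in row))
        return output

    def upload_dataframe(self, df: "DataFrame") -> None:
        import numpy as np

        # Normalize NaN cells to None before handing rows to a SQL driver.
        df = df.replace({np.nan: None})
        print(df.to_dict(orient="records"))


uploader = ExampleUploader()
print(uploader.prepare_data(columns=["id", "text"], data=((1, None), (2, "x"))))

The @requires_dependencies(["pandas"], extras="snowflake") decorator added in the hunk guards the same boundary, presumably so a missing optional extra surfaces as a clear install hint rather than an ImportError inside the method.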
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -6,10 +6,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Union
+from typing import TYPE_CHECKING, Any, Generator, Union

-import numpy as np
-import pandas as pd
 from dateutil import parser
 from pydantic import BaseModel, Field, Secret

@@ -38,6 +36,9 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.utils import get_enhanced_element_id

+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")


@@ -154,13 +155,15 @@ class SQLDownloader(Downloader, ABC):
     def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         pass

-    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list["DataFrame"]:
+        import pandas as pd
+
         data = [dict(zip(columns, row)) for row in rows]
         df = pd.DataFrame(data)
         dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
         return dfs

-    def get_data(self, file_data: SqlBatchFileData) -> list[pd.DataFrame]:
+    def get_data(self, file_data: SqlBatchFileData) -> list["DataFrame"]:
         rows, columns = self.query_db(file_data=file_data)
         return self.sql_to_df(rows=rows, columns=columns)

@@ -174,7 +177,7 @@ class SQLDownloader(Downloader, ABC):
         return f

     def generate_download_response(
-        self, result: pd.DataFrame, file_data: SqlBatchFileData
+        self, result: "DataFrame", file_data: SqlBatchFileData
     ) -> DownloadResponse:
         id_column = file_data.additional_metadata.id_column
         table_name = file_data.additional_metadata.table_name
@@ -231,7 +234,7 @@ class SQLUploadStager(UploadStager):
         data[RECORD_ID_LABEL] = file_data.identifier
         return data

-    def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
         for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
             df[column] = df[column].apply(parse_date_string).apply(lambda date: date.timestamp())
         for column in filter(
@@ -259,6 +262,8 @@ class SQLUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)

         df = pd.DataFrame(
@@ -309,6 +314,8 @@ class SQLUploader(Uploader):
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
@@ -323,7 +330,9 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def _fit_to_schema(self, df: pd.DataFrame, add_missing_columns: bool = True) -> pd.DataFrame:
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
         table_columns = self.get_table_columns()
         columns = set(df.columns)
         schema_fields = set(table_columns)
@@ -348,7 +357,9 @@ class SQLUploader(Uploader):
             df[column] = pd.Series()
         return df

-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
@@ -409,6 +420,8 @@ class SQLUploader(Uploader):
         logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")

     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data)
         self.upload_dataframe(df=df, file_data=file_data)

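The sql_to_df change above keeps its existing behavior of turning a query result into one single-row DataFrame per record and only moves the pandas import inside the method. A small self-contained sketch of that row-splitting step; the function name is reused for illustration and is not imported from the package.

from typing import Any


def sql_to_df(rows: list[tuple[Any, ...]], columns: list[str]) -> list:
    import pandas as pd  # deferred, mirroring the change to SQLDownloader.sql_to_df

    data = [dict(zip(columns, row)) for row in rows]
    df = pd.DataFrame(data)
    # One DataFrame per source row, so each record can be handled independently downstream.
    return [pd.DataFrame([row.values], columns=df.columns) for _, row in df.iterrows()]


frames = sql_to_df(rows=[(1, "alpha"), (2, "beta")], columns=["id", "text"])
print(len(frames), frames[0].iloc[0]["text"])  # 2 alpha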
unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -4,9 +4,9 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator

-import pandas as pd
 from pydantic import Field, Secret, model_validator

+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     from sqlite3 import Connection as SqliteConnection
     from sqlite3 import Cursor as SqliteCursor

+
 CONNECTOR_TYPE = "sqlite"


@@ -132,9 +133,12 @@ class SQLiteUploader(SQLUploader):
     connection_config: SQLiteConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"])
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
unstructured_ingest/v2/processes/connectors/sql/vastdb.py

@@ -2,8 +2,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -34,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:
+    from pandas import DataFrame
     from vastdb import connect as VastdbConnect
     from vastdb import transaction as VastdbTransaction
     from vastdb.table import Table as VastdbTable
@@ -128,7 +127,6 @@ class VastdbDownloader(SQLDownloader):
         ids = tuple([item.identifier for item in file_data.batch_items])

         with self.connection_config.get_table(table_name) as table:
-
             predicate = _[id_column].isin(ids)

             if self.download_config.fields:
@@ -168,7 +166,7 @@ class VastdbUploadStager(SQLUploadStager):
         data[RECORD_ID_LABEL] = file_data.identifier
         return data

-    def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
         df = super().conform_dataframe(df=df)
         if self.upload_stager_config.rename_columns_map:
             df.rename(columns=self.upload_stager_config.rename_columns_map, inplace=True)
@@ -193,8 +191,9 @@ class VastdbUploader(SQLUploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["pyarrow"], extras="vastdb")
-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    @requires_dependencies(["pyarrow", "pandas"], extras="vastdb")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
         import pyarrow as pa

         if self.can_delete():
@@ -216,7 +215,6 @@ class VastdbUploader(SQLUploader):
             )

         for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
-
             with self.connection_config.get_table(self.upload_config.table_name) as table:
                 pa_table = pa.Table.from_pandas(rows)
                 table.insert(pa_table)
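The vastdb upload loop above converts each DataFrame chunk to a pyarrow table before insertion. A rough sketch of that chunk-and-convert step; the locally defined split_dataframe is only a stand-in for the helper in unstructured_ingest.utils.data_prep, and print replaces the real table.insert call.

from typing import Generator

import pandas as pd
import pyarrow as pa


def split_dataframe(df: pd.DataFrame, chunk_size: int) -> Generator[pd.DataFrame, None, None]:
    # Stand-in for unstructured_ingest.utils.data_prep.split_dataframe: yield row slices.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start : start + chunk_size]


df = pd.DataFrame({"id": [1, 2, 3], "text": ["a", "b", "c"]})
for rows in split_dataframe(df=df, chunk_size=2):
    pa_table = pa.Table.from_pandas(rows)  # Arrow table carrying the chunk's rows and schema
    print(pa_table.num_rows)  # a real uploader would call table.insert(pa_table) here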
unstructured_ingest/v2/types/__init__.py (file without changes)
unstructured_ingest/v2/types/file_data.py

@@ -0,0 +1,116 @@
+import json
+from pathlib import Path
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
+
+from unstructured_ingest.v2.logger import logger
+
+
+class SourceIdentifiers(BaseModel):
+    filename: str
+    fullpath: str
+    rel_path: Optional[str] = None
+
+    @property
+    def filename_stem(self) -> str:
+        return Path(self.filename).stem
+
+    @property
+    def relative_path(self) -> str:
+        return self.rel_path or self.fullpath
+
+
+class FileDataSourceMetadata(BaseModel):
+    url: Optional[str] = None
+    version: Optional[str] = None
+    record_locator: Optional[dict[str, Any]] = None
+    date_created: Optional[str] = None
+    date_modified: Optional[str] = None
+    date_processed: Optional[str] = None
+    permissions_data: Optional[list[dict[str, Any]]] = None
+    filesize_bytes: Optional[int] = None
+
+
+class FileData(BaseModel):
+    identifier: str
+    connector_type: str
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
+    reprocess: bool = False
+    local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
+
+    @classmethod
+    def from_file(cls, path: str) -> "FileData":
+        path = Path(path).resolve()
+        if not path.exists() or not path.is_file():
+            raise ValueError(f"file path not valid: {path}")
+        with open(str(path.resolve()), "rb") as f:
+            file_data_dict = json.load(f)
+        file_data = cls.model_validate(file_data_dict)
+        return file_data
+
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
+    def to_file(self, path: str) -> None:
+        path = Path(path).resolve()
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(path.resolve()), "w") as f:
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not detected as batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)