unstructured-ingest 0.5.20__py3-none-any.whl → 0.5.23__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (29)
  1. test/integration/connectors/test_astradb.py +8 -2
  2. unstructured_ingest/__version__.py +1 -1
  3. unstructured_ingest/embed/interfaces.py +7 -3
  4. unstructured_ingest/utils/data_prep.py +17 -5
  5. unstructured_ingest/utils/table.py +11 -4
  6. unstructured_ingest/v2/interfaces/__init__.py +8 -1
  7. unstructured_ingest/v2/interfaces/file_data.py +13 -116
  8. unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
  9. unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
  10. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
  11. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
  12. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -1
  13. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +7 -1
  14. unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
  15. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
  16. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
  17. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
  18. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
  19. unstructured_ingest/v2/processes/connectors/sql/sql.py +22 -9
  20. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
  21. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
  22. unstructured_ingest/v2/types/__init__.py +0 -0
  23. unstructured_ingest/v2/types/file_data.py +116 -0
  24. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/METADATA +168 -17
  25. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/RECORD +29 -27
  26. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/LICENSE.md +0 -0
  27. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/WHEEL +0 -0
  28. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/entry_points.txt +0 -0
  29. {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.utils.data_prep import split_dataframe
@@ -32,6 +30,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 )

 if TYPE_CHECKING:
+    from pandas import DataFrame
     from snowflake.connector import SnowflakeConnection
     from snowflake.connector.cursor import SnowflakeCursor

@@ -174,9 +173,12 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"

+    @requires_dependencies(["pandas"], extras="snowflake")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
@@ -210,7 +212,9 @@ class SnowflakeUploader(SQLUploader):
            ]
        )

-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
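The snowflake.py hunks above show the pattern this release applies across the connectors: pandas and numpy leave the module-level imports, type hints move behind TYPE_CHECKING as string annotations, and the libraries are imported lazily inside the methods that need them, optionally guarded by requires_dependencies. A minimal, self-contained sketch of that pattern (the ExampleUploader class and to_dataframe method are illustrative, not part of the package):

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Visible to type checkers only; importing this module no longer requires pandas.
    from pandas import DataFrame


class ExampleUploader:
    def to_dataframe(self, data: list[dict[str, Any]]) -> "DataFrame":
        # Deferred import: the optional dependency is only needed when the method runs.
        import pandas as pd

        return pd.DataFrame(data)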
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -6,10 +6,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Union
+from typing import TYPE_CHECKING, Any, Generator, Union

-import numpy as np
-import pandas as pd
 from dateutil import parser
 from pydantic import BaseModel, Field, Secret

@@ -38,6 +36,9 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.utils import get_enhanced_element_id

+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")


@@ -154,13 +155,15 @@ class SQLDownloader(Downloader, ABC):
     def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         pass

-    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list["DataFrame"]:
+        import pandas as pd
+
         data = [dict(zip(columns, row)) for row in rows]
         df = pd.DataFrame(data)
         dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
         return dfs

-    def get_data(self, file_data: SqlBatchFileData) -> list[pd.DataFrame]:
+    def get_data(self, file_data: SqlBatchFileData) -> list["DataFrame"]:
         rows, columns = self.query_db(file_data=file_data)
         return self.sql_to_df(rows=rows, columns=columns)

@@ -174,7 +177,7 @@ class SQLDownloader(Downloader, ABC):
         return f

     def generate_download_response(
-        self, result: pd.DataFrame, file_data: SqlBatchFileData
+        self, result: "DataFrame", file_data: SqlBatchFileData
     ) -> DownloadResponse:
         id_column = file_data.additional_metadata.id_column
         table_name = file_data.additional_metadata.table_name
@@ -231,7 +234,7 @@ class SQLUploadStager(UploadStager):
         data[RECORD_ID_LABEL] = file_data.identifier
         return data

-    def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
         for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
             df[column] = df[column].apply(parse_date_string).apply(lambda date: date.timestamp())
         for column in filter(
@@ -259,6 +262,8 @@ class SQLUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)

         df = pd.DataFrame(
@@ -309,6 +314,8 @@ class SQLUploader(Uploader):
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
@@ -323,7 +330,9 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def _fit_to_schema(self, df: pd.DataFrame, add_missing_columns: bool = True) -> pd.DataFrame:
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
         table_columns = self.get_table_columns()
         columns = set(df.columns)
         schema_fields = set(table_columns)
@@ -348,7 +357,9 @@ class SQLUploader(Uploader):
             df[column] = pd.Series()
         return df

-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
@@ -409,6 +420,8 @@ class SQLUploader(Uploader):
         logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")

     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data)
         self.upload_dataframe(df=df, file_data=file_data)

unstructured_ingest/v2/processes/connectors/sql/sqlite.py

@@ -4,9 +4,9 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator

-import pandas as pd
 from pydantic import Field, Secret, model_validator

+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     from sqlite3 import Connection as SqliteConnection
     from sqlite3 import Cursor as SqliteCursor

+
 CONNECTOR_TYPE = "sqlite"


@@ -132,9 +133,12 @@ class SQLiteUploader(SQLUploader):
     connection_config: SQLiteConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"])
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
unstructured_ingest/v2/processes/connectors/sql/vastdb.py

@@ -2,8 +2,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -34,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:
+    from pandas import DataFrame
     from vastdb import connect as VastdbConnect
     from vastdb import transaction as VastdbTransaction
     from vastdb.table import Table as VastdbTable
@@ -128,7 +127,6 @@ class VastdbDownloader(SQLDownloader):
         ids = tuple([item.identifier for item in file_data.batch_items])

         with self.connection_config.get_table(table_name) as table:
-
             predicate = _[id_column].isin(ids)

             if self.download_config.fields:
@@ -168,7 +166,7 @@ class VastdbUploadStager(SQLUploadStager):
         data[RECORD_ID_LABEL] = file_data.identifier
         return data

-    def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
         df = super().conform_dataframe(df=df)
         if self.upload_stager_config.rename_columns_map:
             df.rename(columns=self.upload_stager_config.rename_columns_map, inplace=True)
@@ -193,8 +191,9 @@ class VastdbUploader(SQLUploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["pyarrow"], extras="vastdb")
-    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+    @requires_dependencies(["pyarrow", "pandas"], extras="vastdb")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
         import pyarrow as pa

         if self.can_delete():
@@ -216,7 +215,6 @@ class VastdbUploader(SQLUploader):
         )

         for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
-
             with self.connection_config.get_table(self.upload_config.table_name) as table:
                 pa_table = pa.Table.from_pandas(rows)
                 table.insert(pa_table)
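The reworked VastdbUploader.upload_dataframe keeps the same chunked pandas-to-pyarrow handoff: the DataFrame is split into batches and each batch is converted with pa.Table.from_pandas before insertion. A rough sketch of that handoff, using a local chunking helper in place of the package's split_dataframe and a print in place of the VAST DB table insert:

from typing import Generator

import pandas as pd
import pyarrow as pa


def chunk_dataframe(df: pd.DataFrame, chunk_size: int) -> Generator[pd.DataFrame, None, None]:
    # Simplified stand-in for unstructured_ingest.utils.data_prep.split_dataframe.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start : start + chunk_size]


df = pd.DataFrame({"record_id": ["a", "b", "c"], "text": ["one", "two", "three"]})
for rows in chunk_dataframe(df, chunk_size=2):
    pa_table = pa.Table.from_pandas(rows)
    # In the connector this is followed by table.insert(pa_table) inside get_table(...).
    print(pa_table.num_rows)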
unstructured_ingest/v2/types/__init__.py (file without changes)
unstructured_ingest/v2/types/file_data.py (new file)

@@ -0,0 +1,116 @@
+import json
+from pathlib import Path
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
+
+from unstructured_ingest.v2.logger import logger
+
+
+class SourceIdentifiers(BaseModel):
+    filename: str
+    fullpath: str
+    rel_path: Optional[str] = None
+
+    @property
+    def filename_stem(self) -> str:
+        return Path(self.filename).stem
+
+    @property
+    def relative_path(self) -> str:
+        return self.rel_path or self.fullpath
+
+
+class FileDataSourceMetadata(BaseModel):
+    url: Optional[str] = None
+    version: Optional[str] = None
+    record_locator: Optional[dict[str, Any]] = None
+    date_created: Optional[str] = None
+    date_modified: Optional[str] = None
+    date_processed: Optional[str] = None
+    permissions_data: Optional[list[dict[str, Any]]] = None
+    filesize_bytes: Optional[int] = None
+
+
+class FileData(BaseModel):
+    identifier: str
+    connector_type: str
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
+    reprocess: bool = False
+    local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
+
+    @classmethod
+    def from_file(cls, path: str) -> "FileData":
+        path = Path(path).resolve()
+        if not path.exists() or not path.is_file():
+            raise ValueError(f"file path not valid: {path}")
+        with open(str(path.resolve()), "rb") as f:
+            file_data_dict = json.load(f)
+        file_data = cls.model_validate(file_data_dict)
+        return file_data
+
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
+    def to_file(self, path: str) -> None:
+        path = Path(path).resolve()
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(path.resolve()), "w") as f:
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not detected as batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)
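The new unstructured_ingest.v2.types.file_data module holds the FileData models; the +116/-116 counts on files 7 and 23 in the list above suggest this code moved here from v2/interfaces/file_data.py. A hedged usage sketch based only on the code shown above; the identifiers and paths in it are illustrative:

from unstructured_ingest.v2.types.file_data import (
    BatchFileData,
    BatchItem,
    FileData,
    SourceIdentifiers,
    file_data_from_dict,
)

# A plain FileData record for a single source document (example values).
file_data = FileData(
    identifier="example-id",
    connector_type="local",
    source_identifiers=SourceIdentifiers(filename="report.pdf", fullpath="/tmp/report.pdf"),
)

# BatchFileData derives its identifier from its batch items, so none is passed in.
batch = BatchFileData(
    connector_type="sql",
    batch_items=[BatchItem(identifier="row-1"), BatchItem(identifier="row-2")],
)

# file_data_from_dict tries BatchFileData first and falls back to FileData.
assert isinstance(file_data_from_dict(batch.model_dump()), BatchFileData)
assert type(file_data_from_dict(file_data.model_dump())) is FileData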