unstructured-ingest 0.5.20__py3-none-any.whl → 0.5.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest was flagged as possibly problematic by the registry.
- test/integration/connectors/test_astradb.py +8 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/interfaces.py +7 -3
- unstructured_ingest/utils/data_prep.py +17 -5
- unstructured_ingest/utils/table.py +11 -4
- unstructured_ingest/v2/interfaces/__init__.py +8 -1
- unstructured_ingest/v2/interfaces/file_data.py +13 -116
- unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
- unstructured_ingest/v2/processes/connectors/sql/sql.py +22 -9
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest/v2/types/file_data.py +116 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/METADATA +168 -17
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/RECORD +29 -27
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.23.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_astradb.py

@@ -1,3 +1,4 @@
+import contextlib
 import json
 import os
 from dataclasses import dataclass

@@ -231,6 +232,13 @@ def test_astra_create_destination():
     )
     collection_name = "system_created-123"
     formatted_collection_name = "system_created_123"
+
+    client = AstraDBClient()
+    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
+    with contextlib.suppress(Exception):
+        # drop collection before trying to create it
+        db.drop_collection(formatted_collection_name)
+
     created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
     assert created
     assert uploader.upload_config.collection_name == formatted_collection_name

@@ -239,8 +247,6 @@ def test_astra_create_destination():
     assert not created

     # cleanup
-    client = AstraDBClient()
-    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
     db.drop_collection(formatted_collection_name)

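
The new setup block makes the create-destination test idempotent: any collection left over from a previous run is dropped up front, and contextlib.suppress(Exception) turns a failed drop (for example, the collection never existed) into a no-op. A minimal standalone sketch of the same pattern; the fake client below is illustrative, not the test's AstraDB fixture:

    import contextlib

    class FakeDB:
        def drop_collection(self, name: str) -> None:
            raise RuntimeError(f"collection {name!r} does not exist")

    def ensure_fresh_collection(db, name: str) -> None:
        # A failing drop is swallowed so the subsequent create starts from a clean slate.
        with contextlib.suppress(Exception):
            db.drop_collection(name)

    ensure_fresh_collection(FakeDB(), "system_created_123")  # does not raise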
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.5.20"  # pragma: no cover
+__version__ = "0.5.23"  # pragma: no cover

unstructured_ingest/embed/interfaces.py

@@ -2,10 +2,10 @@ from abc import ABC
 from dataclasses import dataclass
 from typing import Any, Optional

-import numpy as np
 from pydantic import BaseModel, Field

 from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies

 EMBEDDINGS_KEY = "embeddings"

@@ -32,7 +32,6 @@ class BaseEncoder(ABC):

 @dataclass
 class BaseEmbeddingEncoder(BaseEncoder, ABC):
-
     def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""

@@ -46,8 +45,11 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         return self.embed_query(query="Q")

     @property
+    @requires_dependencies(["numpy"])
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)

@@ -86,7 +88,6 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):

 @dataclass
 class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
-
     async def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""

@@ -100,8 +101,11 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         return await self.embed_query(query="Q")

     @property
+    @requires_dependencies(["numpy"])
     async def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = await self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)

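
Both encoder base classes keep the same unit-vector check but now defer the numpy import into the property, guarded by @requires_dependencies(["numpy"]), so importing the embed interfaces no longer requires numpy. A small sketch of the check itself, using the same tolerance as the code above (the sample embeddings are made up):

    import numpy as np

    def is_unit_vector(embedding: list[float]) -> bool:
        # A vector counts as "unit" when its L2 norm is approximately 1.0.
        return bool(np.isclose(np.linalg.norm(embedding), 1.0, rtol=1e-03))

    print(is_unit_vector([0.6, 0.8]))  # True: sqrt(0.36 + 0.64) == 1.0
    print(is_unit_vector([1.0, 1.0]))  # False: norm is ~1.414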
unstructured_ingest/utils/data_prep.py

@@ -2,20 +2,22 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast

 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger

+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

 T = TypeVar("T")
 IterableT = Iterable[T]


-def split_dataframe(df:
+def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
     num_chunks = len(df) // chunk_size + 1
     for i in range(num_chunks):
         yield df[i * chunk_size : (i + 1) * chunk_size]

@@ -144,9 +146,13 @@ def get_data_by_suffix(path: Path) -> list[dict]:
         elif path.suffix == ".ndjson":
             return ndjson.load(f)
         elif path.suffix == ".csv":
+            import pandas as pd
+
             df = pd.read_csv(path)
             return df.to_dict(orient="records")
         elif path.suffix == ".parquet":
+            import pandas as pd
+
             df = pd.read_parquet(path)
             return df.to_dict(orient="records")
         else:

@@ -180,6 +186,9 @@ def get_data(path: Union[Path, str]) -> list[dict]:
             return ndjson.load(f)
     except Exception as e:
         logger.warning(f"failed to read {path} as ndjson: {e}")
+
+    import pandas as pd
+
     try:
         df = pd.read_csv(path)
         return df.to_dict(orient="records")

@@ -202,7 +211,10 @@ def get_json_data(path: Path) -> list[dict]:
     raise ValueError(f"Unsupported file type: {path}")


-
+@requires_dependencies(["pandas"])
+def get_data_df(path: Path) -> "DataFrame":
+    import pandas as pd
+
     with path.open() as f:
         if path.suffix == ".json":
             data = json.load(f)

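
data_prep.py follows the same pattern as the rest of the release: pandas appears only as a string annotation behind TYPE_CHECKING and is imported inside the functions that need it, so the module stays importable without pandas installed. split_dataframe itself is unchanged apart from its signature; a hedged usage sketch of the chunking behaviour (the sample frame is invented):

    import pandas as pd

    def split_dataframe(df: "pd.DataFrame", chunk_size: int = 100):
        # Same slicing logic as the function above: fixed-size row windows.
        num_chunks = len(df) // chunk_size + 1
        for i in range(num_chunks):
            yield df[i * chunk_size : (i + 1) * chunk_size]

    frame = pd.DataFrame({"id": range(250)})
    print([len(chunk) for chunk in split_dataframe(frame)])  # [100, 100, 50]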
unstructured_ingest/utils/table.py

@@ -1,11 +1,16 @@
-from typing import Any
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any

 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from pandas import DataFrame


+@requires_dependencies(["pandas"])
 def get_default_pandas_dtypes() -> dict[str, Any]:
+    import pandas as pd
+
     return {
         "text": pd.StringDtype(),  # type: ignore
         "type": pd.StringDtype(),  # type: ignore

@@ -57,7 +62,9 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
 def convert_to_pandas_dataframe(
     elements_dict: list[dict[str, Any]],
     drop_empty_cols: bool = False,
-) ->
+) -> "DataFrame":
+    import pandas as pd
+
     # Flatten metadata if it hasn't already been flattened
     for d in elements_dict:
         if metadata := d.pop("metadata", None):

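
convert_to_pandas_dataframe keeps its behaviour of flattening each element's metadata dict before building the frame, with pandas now imported inside the function. A rough sketch of that flattening step; the flatten helper here is a simplified stand-in for the library's flatten_dict, and the sample element is invented:

    import pandas as pd

    def flatten(d: dict, prefix: str = "") -> dict:
        # Simplified stand-in for unstructured_ingest's flatten_dict helper.
        out: dict = {}
        for key, value in d.items():
            name = f"{prefix}{key}"
            if isinstance(value, dict):
                out.update(flatten(value, prefix=f"{name}_"))
            else:
                out[name] = value
        return out

    element = {"text": "Hello", "type": "Title", "metadata": {"filename": "a.pdf", "page_number": 1}}
    if metadata := element.pop("metadata", None):
        element.update(flatten(metadata))

    df = pd.DataFrame([element]).astype({"text": pd.StringDtype(), "type": pd.StringDtype()})
    print(df.columns.tolist())  # ['text', 'type', 'filename', 'page_number']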
unstructured_ingest/v2/interfaces/__init__.py

@@ -1,6 +1,13 @@
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig

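
Because the names are re-exported here, code that imports them through v2.interfaces keeps working, while v2.types.file_data becomes the canonical location. A quick sketch of the two equivalent spellings, assuming the re-export is a plain alias of the same classes:

    # Legacy path: still resolves through the re-export shown above.
    from unstructured_ingest.v2.interfaces import FileData as LegacyFileData

    # Preferred path going forward.
    from unstructured_ingest.v2.types.file_data import FileData

    assert LegacyFileData is FileData  # one class object, two import routes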
unstructured_ingest/v2/interfaces/file_data.py

@@ -1,116 +1,13 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-    rel_path: Optional[str] = None
-
-    @property
-    def filename_stem(self) -> str:
-        return Path(self.filename).stem
-
-    @property
-    def relative_path(self) -> str:
-        return self.rel_path or self.fullpath
-
-
-class FileDataSourceMetadata(BaseModel):
-    url: Optional[str] = None
-    version: Optional[str] = None
-    record_locator: Optional[dict[str, Any]] = None
-    date_created: Optional[str] = None
-    date_modified: Optional[str] = None
-    date_processed: Optional[str] = None
-    permissions_data: Optional[list[dict[str, Any]]] = None
-    filesize_bytes: Optional[int] = None
-
-
-class FileData(BaseModel):
-    identifier: str
-    connector_type: str
-    source_identifiers: SourceIdentifiers
-    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = Field(default_factory=dict)
-    reprocess: bool = False
-    local_download_path: Optional[str] = None
-    display_name: Optional[str] = None
-
-    @classmethod
-    def from_file(cls, path: str) -> "FileData":
-        path = Path(path).resolve()
-        if not path.exists() or not path.is_file():
-            raise ValueError(f"file path not valid: {path}")
-        with open(str(path.resolve()), "rb") as f:
-            file_data_dict = json.load(f)
-        file_data = cls.model_validate(file_data_dict)
-        return file_data
-
-    @classmethod
-    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
-        file_data_dict = file_data.model_dump()
-        return cls.model_validate(file_data_dict, **kwargs)
-
-    def to_file(self, path: str) -> None:
-        path = Path(path).resolve()
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(str(path.resolve()), "w") as f:
-            json.dump(self.model_dump(), f, indent=2)
-
-
-class BatchItem(BaseModel):
-    identifier: str
-    version: Optional[str] = None
-
-
-class BatchFileData(FileData):
-    identifier: str = Field(init=False)
-    batch_items: list[BatchItem]
-    source_identifiers: Optional[SourceIdentifiers] = None
-
-    @field_validator("batch_items")
-    @classmethod
-    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
-        if not v:
-            raise ValueError("batch items cannot be empty")
-        all_identifiers = [item.identifier for item in v]
-        if len(all_identifiers) != len(set(all_identifiers)):
-            raise ValueError(f"duplicate identifiers: {all_identifiers}")
-        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
-        return sorted_batch_items
-
-    @model_validator(mode="before")
-    @classmethod
-    def populate_identifier(cls, data: Any) -> Any:
-        if isinstance(data, dict) and "identifier" not in data:
-            batch_items = data["batch_items"]
-            identifier_data = json.dumps(
-                {item.identifier: item.version for item in batch_items}, sort_keys=True
-            )
-            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
-        return data
-
-
-def file_data_from_file(path: str) -> FileData:
-    try:
-        return BatchFileData.from_file(path=path)
-    except ValidationError:
-        logger.debug(f"{path} not detected as batch file data")
-
-    return FileData.from_file(path=path)
-
-
-def file_data_from_dict(data: dict) -> FileData:
-    try:
-        return BatchFileData.model_validate(data)
-    except ValidationError:
-        logger.debug(f"{data} not valid for batch file data")
-
-    return FileData.model_validate(data)
+"""
+COMPATABILITY NOTICE:
+This file has moved to the v2/types/ module.
+The following line exists for backward compatibility.
+"""
+
+from unstructured_ingest.v2.types.file_data import *  # noqa - star imports are bad, but this is for maximal backward compatability
+
+# Eventually this file should go away. Let's start warning users now:
+logger.warning(  # noqa - using logger from the star import
+    "Importing file_data.py through interfaces is deprecated. "
+    "Please use unstructured_ingest.v2.types.file_data instead!"
+)

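
Since the models are moved rather than changed, serialization round-trips should only need the new import path. A minimal sketch based on the fields visible in the removed block; the identifier, connector type, and paths are placeholders, and SourceIdentifiers taking filename/fullpath is inferred from the properties shown above:

    from pathlib import Path
    from tempfile import TemporaryDirectory

    from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers, file_data_from_file

    file_data = FileData(
        identifier="example-0001",  # placeholder values throughout
        connector_type="local",
        source_identifiers=SourceIdentifiers(filename="report.pdf", fullpath="/tmp/report.pdf"),
    )

    with TemporaryDirectory() as tmp:
        path = str(Path(tmp) / "file_data.json")
        file_data.to_file(path)               # json.dump of model_dump()
        restored = file_data_from_file(path)  # tries BatchFileData first, then falls back to FileData
        assert restored.identifier == file_data.identifier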
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -3,10 +3,9 @@ import traceback
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from pathlib import Path
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError

@@ -27,6 +26,9 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis

 CONNECTOR_TYPE = "delta_table"

+if TYPE_CHECKING:
+    from pandas import DataFrame
+

 @requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):

@@ -136,7 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )

@@ -172,7 +174,10 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")

+    @requires_dependencies(["pandas"], extras="delta-table")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)

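
This is the pattern repeated across the connectors below: the top-level `import pandas as pd` disappears, the method that actually needs pandas imports it locally, and @requires_dependencies points users at the right optional extra when the package is missing. The real decorator lives in unstructured_ingest.utils.dep_check; the following is only an illustrative stand-in for how such a guard can work, not that implementation:

    import functools
    import importlib.util
    from typing import Optional

    def requires_deps(deps: list[str], extras: Optional[str] = None):
        # Illustrative: verify each module is importable before running the wrapped function.
        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                missing = [dep for dep in deps if importlib.util.find_spec(dep) is None]
                if missing:
                    hint = f' (try pip install "unstructured-ingest[{extras}]")' if extras else ""
                    raise ImportError(f"{fn.__name__} requires {', '.join(missing)}{hint}")
                return fn(*args, **kwargs)
            return wrapper
        return decorator

    @requires_deps(["pandas"], extras="delta-table")
    def run_data(data: list[dict]) -> int:
        import pandas as pd  # deferred: only imported once the guard has passed
        return len(pd.DataFrame(data=data))

    print(run_data([{"text": "hello"}]))  # 1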
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -2,9 +2,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

-import pandas as pd
-
 from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id

@@ -55,7 +54,6 @@ _COLUMNS = (

 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
         metadata: dict[str, Any] = data.pop("metadata", {})

@@ -72,6 +70,7 @@ class BaseDuckDBUploadStager(UploadStager):
         data = {k: v for k, v in data.items() if k in _COLUMNS}
         return data

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(
         self,
         elements_filepath: Path,

@@ -80,6 +79,8 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"

unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError

@@ -23,6 +22,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as DuckDBConnection
+    from pandas import DataFrame

 CONNECTOR_TYPE = "duckdb"

@@ -101,7 +101,7 @@ class DuckDBUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

         with self.connection_config.get_client() as conn:

@@ -109,7 +109,10 @@ class DuckDBUploader(Uploader):
                 f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)

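
The INSERT statement above selects from a local pandas DataFrame named df: DuckDB's replacement scans can resolve that Python variable directly, and BY NAME matches columns by name rather than position. A hedged, self-contained sketch against an in-memory database (the table and columns are invented):

    import duckdb
    import pandas as pd

    conn = duckdb.connect()  # in-memory database
    conn.execute("CREATE TABLE elements (id VARCHAR, text VARCHAR)")

    df = pd.DataFrame([{"text": "hello", "id": "a-1"}])  # column order differs from the table
    conn.execute("INSERT INTO elements BY NAME SELECT * FROM df")  # df found via replacement scan

    print(conn.execute("SELECT id, text FROM elements").fetchall())  # [('a-1', 'hello')]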
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version

@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as MotherDuckConnection
+    from pandas import DataFrame

 CONNECTOR_TYPE = "motherduck"

@@ -100,7 +100,7 @@ class MotherDuckUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
         database = self.connection_config.database
         db_schema = self.connection_config.db_schema

@@ -109,7 +109,10 @@ class MotherDuckUploader(Uploader):
         with self.connection_config.get_client() as conn:
             conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

@@ -6,7 +6,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union

-from pydantic import BaseModel, Field, Secret, SecretStr
+from pydantic import BaseModel, Field, Secret, SecretStr, field_validator

 from unstructured_ingest.error import (
     DestinationConnectionError,

@@ -98,6 +98,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
     ca_certs: Optional[Path] = None
     access_config: Secret[ElasticsearchAccessConfig]

+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:

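
The new validator lets callers pass a single host as a bare string; running in "before" mode, it wraps the string in a list ahead of the normal type validation. A standalone pydantic v2 sketch of the same coercion (the model below is a stand-in, not the connector's config class):

    from pydantic import BaseModel, field_validator

    class HostsConfig(BaseModel):
        hosts: list[str]

        @field_validator("hosts", mode="before")
        @classmethod
        def to_list(cls, value):
            # Runs before list[str] validation, so a bare string can be wrapped instead of rejected.
            if isinstance(value, str):
                return [value]
            return value

    print(HostsConfig(hosts="http://localhost:9200").hosts)             # ['http://localhost:9200']
    print(HostsConfig(hosts=["http://a:9200", "http://b:9200"]).hosts)  # unchanged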
unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py

@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional

-from pydantic import BaseModel, Field, Secret
+from pydantic import BaseModel, Field, Secret, field_validator

 from unstructured_ingest.error import (
     DestinationConnectionError,

@@ -78,6 +78,12 @@ class OpenSearchConnectionConfig(ConnectionConfig):

     access_config: Secret[OpenSearchAccessConfig]

+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:

unstructured_ingest/v2/processes/connectors/kdbai.py

@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError

@@ -26,6 +25,7 @@ from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:
     from kdbai_client import Database, Session, Table
+    from pandas import DataFrame

 CONNECTOR_TYPE = "kdbai"


@@ -118,11 +118,11 @@ class KdbaiUploader(Uploader):
             table = db.table(self.upload_config.table_name)
             yield table

-    def upsert_batch(self, batch:
+    def upsert_batch(self, batch: "DataFrame"):
         with self.get_table() as table:
             table.insert(batch)

-    def process_dataframe(self, df:
+    def process_dataframe(self, df: "DataFrame"):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"

@@ -130,7 +130,10 @@ class KdbaiUploader(Uploader):
         for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)

+    @requires_dependencies(["pandas"], extras="kdbai")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py

@@ -8,7 +8,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional

-import pandas as pd
 from pydantic import Field

 from unstructured_ingest.error import DestinationConnectionError

@@ -26,6 +25,7 @@ CONNECTOR_TYPE = "lancedb"
 if TYPE_CHECKING:
     from lancedb import AsyncConnection
     from lancedb.table import AsyncTable
+    from pandas import DataFrame


 class LanceDBConnectionConfig(ConnectionConfig, ABC):

@@ -69,6 +69,7 @@ class LanceDBUploadStager(UploadStager):
         default_factory=LanceDBUploadStagerConfig
     )

+    @requires_dependencies(["pandas"], extras="lancedb")
     def run(
         self,
         elements_filepath: Path,

@@ -77,6 +78,8 @@ class LanceDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         with open(elements_filepath) as elements_file:
             elements_contents: list[dict] = json.load(elements_file)

@@ -129,7 +132,10 @@ class LanceDBUploader(Uploader):
         finally:
             table.close()

+    @requires_dependencies(["pandas"], extras="lancedb")
     async def run_async(self, path, file_data, **kwargs):
+        import pandas as pd
+
         df = pd.read_feather(path)
         async with self.get_table() as table:
             schema = await table.schema()

@@ -144,7 +150,9 @@ class LanceDBUploader(Uploader):
         await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
         await table.add(data=df)

-    def _fit_to_schema(self, df:
+    def _fit_to_schema(self, df: "DataFrame", schema) -> "DataFrame":
+        import pandas as pd
+
         columns = set(df.columns)
         schema_fields = set(schema.names)
         columns_to_drop = columns - schema_fields

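
_fit_to_schema keeps the same column-alignment idea with a lazy pandas import: columns the target table schema does not know about are dropped before the add. The hunk is cut off after columns_to_drop, so the handling of missing schema fields below is an assumption, marked as such; the frame and schema names are invented:

    import pandas as pd

    def fit_to_schema(df: pd.DataFrame, schema_names: list[str]) -> pd.DataFrame:
        # Drop columns the schema does not contain (mirrors the visible part of the diff)...
        df = df.drop(columns=list(set(df.columns) - set(schema_names)))
        # ...and add any schema fields the frame lacks (assumption: filled with None).
        for missing in set(schema_names) - set(df.columns):
            df[missing] = None
        return df

    frame = pd.DataFrame([{"text": "hello", "debug_only": 1}])
    print(fit_to_schema(frame, ["text", "embeddings"]).columns.tolist())  # ['text', 'embeddings']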
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.utils.data_prep import split_dataframe

@@ -27,6 +25,7 @@ if TYPE_CHECKING:
     from databricks.sdk.core import oauth_service_principal
     from databricks.sql.client import Connection as DeltaTableConnection
     from databricks.sql.client import Cursor as DeltaTableCursor
+    from pandas import DataFrame

 CONNECTOR_TYPE = "databricks_delta_tables"


@@ -180,7 +179,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
         )
         return statement

-
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:

unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -3,9 +3,9 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,

@@ -46,6 +46,7 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):
     database: Optional[str] = Field(default=None, description="SingleStore database")

     @contextmanager
+    @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
         import singlestoredb as s2

@@ -130,9 +131,12 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []