unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +178 -0
- test/integration/connectors/sql/test_sqlite.py +151 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +203 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
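The file list shows the flat unstructured_ingest/v2/processes/connectors/sql.py module (removed, -275) being replaced by a connectors/sql/ package: shared base classes in sql/sql.py plus postgres.py and sqlite.py implementations. A minimal import sketch of the new layout, assuming the 0.1.1 wheel is installed and using only symbols that appear in the diff below:

# Hedged sketch: the new connectors/sql package layout in 0.1.1.
from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager  # shared base classes
from unstructured_ingest.v2.processes.connectors.sql.sqlite import sqlite_destination_entry  # SQLite registry entry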
unstructured_ingest/v2/processes/connectors/sql/sql.py (new file)
@@ -0,0 +1,310 @@
+import hashlib
+import json
+import sys
+import uuid
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field, replace
+from datetime import date, datetime
+from pathlib import Path
+from time import time
+from typing import Any, Generator, Union
+
+import pandas as pd
+from dateutil import parser
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+    download_responses,
+)
+from unstructured_ingest.v2.logger import logger
+
+_COLUMNS = (
+    "id",
+    "element_id",
+    "text",
+    "embeddings",
+    "type",
+    "system",
+    "layout_width",
+    "layout_height",
+    "points",
+    "url",
+    "version",
+    "date_created",
+    "date_modified",
+    "date_processed",
+    "permissions_data",
+    "record_locator",
+    "category_depth",
+    "parent_id",
+    "attached_filename",
+    "filetype",
+    "last_modified",
+    "file_directory",
+    "filename",
+    "languages",
+    "page_number",
+    "links",
+    "page_name",
+    "link_urls",
+    "link_texts",
+    "sent_from",
+    "sent_to",
+    "subject",
+    "section",
+    "header_footer_type",
+    "emphasized_text_contents",
+    "emphasized_text_tags",
+    "text_as_html",
+    "regex_metadata",
+    "detection_class_prob",
+)
+
+_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
+
+
+def parse_date_string(date_value: Union[str, int]) -> date:
+    try:
+        timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
+        return datetime.fromtimestamp(timestamp)
+    except Exception as e:
+        logger.debug(f"date {date_value} string not a timestamp: {e}")
+    return parser.parse(date_value)
+
+
+class SQLAccessConfig(AccessConfig):
+    pass
+
+
+class SQLConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
+
+    @abstractmethod
+    def get_connection(self) -> Any:
+        pass
+
+
+class SQLIndexerConfig(IndexerConfig):
+    table_name: str
+    id_column: str
+    batch_size: int = 100
+
+
+class SQLIndexer(Indexer, ABC):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+
+    @abstractmethod
+    def _get_doc_ids(self) -> list[str]:
+        pass
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+        id_batches: list[frozenset[str]] = [
+            frozenset(
+                ids[
+                    i
+                    * self.index_config.batch_size : (i + 1)  # noqa
+                    * self.index_config.batch_size
+                ]
+            )
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(batch) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=self.connector_type,
+                metadata=FileDataSourceMetadata(
+                    date_processed=str(time()),
+                ),
+                doc_type="batch",
+                additional_metadata={
+                    "ids": list(batch),
+                    "table_name": self.index_config.table_name,
+                    "id_column": self.index_config.id_column,
+                },
+            )
+
+
+class SQLDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+class SQLDownloader(Downloader, ABC):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+
+    @abstractmethod
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        pass
+
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[pd.DataFrame]:
+        data = [dict(zip(columns, row)) for row in rows]
+        df = pd.DataFrame(data)
+        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
+        return dfs
+
+    def get_data(self, file_data: FileData) -> list[pd.DataFrame]:
+        rows, columns = self.query_db(file_data=file_data)
+        return self.sql_to_df(rows=rows, columns=columns)
+
+    def get_identifier(self, table_name: str, record_id: str) -> str:
+        f = f"{table_name}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def generate_download_response(
+        self, result: pd.DataFrame, file_data: FileData
+    ) -> DownloadResponse:
+        id_column = file_data.additional_metadata["id_column"]
+        table_name = file_data.additional_metadata["table_name"]
+        record_id = result.iloc[0][id_column]
+        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
+        filename = f"{filename_id}.csv"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        result.to_csv(download_path)
+        copied_file_data = replace(file_data)
+        copied_file_data.identifier = filename_id
+        copied_file_data.doc_type = "file"
+        copied_file_data.additional_metadata.pop("ids", None)
+        return super().generate_download_response(
+            file_data=copied_file_data, download_path=download_path
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        data_dfs = self.get_data(file_data=file_data)
+        download_responses = []
+        for df in data_dfs:
+            download_responses.append(
+                self.generate_download_response(result=df, file_data=file_data)
+            )
+        return download_responses
+
+
+class SQLUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class SQLUploadStager(UploadStager):
+    upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents: list[dict] = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        output = []
+        for data in elements_contents:
+            metadata: dict[str, Any] = data.pop("metadata", {})
+            data_source = metadata.pop("data_source", {})
+            coordinates = metadata.pop("coordinates", {})
+
+            data.update(metadata)
+            data.update(data_source)
+            data.update(coordinates)
+
+            data["id"] = str(uuid.uuid4())
+
+            # remove extraneous, not supported columns
+            data = {k: v for k, v in data.items() if k in _COLUMNS}
+
+            output.append(data)
+
+        df = pd.DataFrame.from_dict(output)
+        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
+            df[column] = df[column].apply(parse_date_string)
+        for column in filter(
+            lambda x: x in df.columns,
+            ("permissions_data", "record_locator", "points", "links"),
+        ):
+            df[column] = df[column].apply(
+                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
+            )
+        for column in filter(
+            lambda x: x in df.columns,
+            ("version", "page_number", "regex_metadata"),
+        ):
+            df[column] = df[column].apply(str)
+
+        with output_path.open("w") as output_file:
+            df.to_json(output_file, orient="records", lines=True)
+        return output_path
+
+
+class SQLUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of records per batch")
+    table_name: str = Field(default="elements", description="which table to upload contents to")
+
+
+@dataclass
+class SQLUploader(Uploader):
+    upload_config: SQLUploaderConfig
+    connection_config: SQLConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @abstractmethod
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        pass
+
+    @abstractmethod
+    def upload_contents(self, path: Path) -> None:
+        pass
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)
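The id-batching list comprehension in SQLIndexer.run above is dense; here is a minimal standalone sketch with hypothetical values showing how the ids are sliced into batch_size-sized frozensets, the last batch holding the remainder:

# Hypothetical values illustrating the slicing used by SQLIndexer.run.
ids = [str(n) for n in range(7)]  # pretend _get_doc_ids() returned 7 ids
batch_size = 3
id_batches = [
    frozenset(ids[i * batch_size : (i + 1) * batch_size])
    for i in range((len(ids) + batch_size - 1) // batch_size)  # ceiling division -> 3 batches
]
print([sorted(batch) for batch in id_batches])  # [['0', '1', '2'], ['3', '4', '5'], ['6']]

Each frozenset is then hashed (offset by sys.maxsize + 1 so the result is non-negative) to build a stable FileData identifier for the batch.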
unstructured_ingest/v2/processes/connectors/sql/sqlite.py (new file)
@@ -0,0 +1,172 @@
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret, model_validator
+
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    _DATE_COLUMNS,
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+    parse_date_string,
+)
+
+if TYPE_CHECKING:
+    from sqlite3 import Connection as SqliteConnection
+
+CONNECTOR_TYPE = "sqlite"
+
+
+class SQLiteAccessConfig(SQLAccessConfig):
+    pass
+
+
+class SQLiteConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[SQLiteAccessConfig] = Field(
+        default=SQLiteAccessConfig(), validate_default=True
+    )
+    database_path: Path = Field(
+        description="Path to the .db file.",
+    )
+
+    @model_validator(mode="after")
+    def check_database_path(self) -> "SQLiteConnectionConfig":
+        if not self.database_path.exists():
+            raise ValueError(f"{self.database_path} does not exist")
+        if not self.database_path.is_file():
+            raise ValueError(f"{self.database_path} is not a valid file")
+        return self
+
+    def get_connection(self) -> "SqliteConnection":
+        from sqlite3 import connect
+
+        return connect(database=self.database_path)
+
+
+class SQLiteIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class SQLiteIndexer(SQLIndexer):
+    connection_config: SQLConnectionConfig
+    index_config: SQLIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            cursor.execute(
+                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
+            )
+            results = cursor.fetchall()
+            ids = [result[0] for result in results]
+            return ids
+
+
+class SQLiteDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteDownloader(SQLDownloader):
+    connection_config: SQLConnectionConfig
+    download_config: SQLDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        table_name = file_data.additional_metadata["table_name"]
+        id_column = file_data.additional_metadata["id_column"]
+        ids = file_data.additional_metadata["ids"]
+        with self.connection_config.get_connection() as sqlite_connection:
+            cursor = sqlite_connection.cursor()
+            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
+            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+                fields=fields,
+                table_name=table_name,
+                id_column=id_column,
+                ids=",".join([str(i) for i in ids]),
+            )
+            logger.debug(f"running query: {query}")
+            cursor.execute(query)
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+            return rows, columns
+
+
+class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class SQLiteUploadStager(SQLUploadStager):
+    upload_stager_config: SQLiteUploadStagerConfig
+
+
+class SQLiteUploaderConfig(SQLUploaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteUploader(SQLUploader):
+    upload_config: SQLiteUploaderConfig = field(default_factory=SQLiteUploaderConfig)
+    connection_config: SQLiteConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if isinstance(value, (list, dict)):
+                    value = json.dumps(value)
+                if column_name in _DATE_COLUMNS:
+                    if value is None:
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
+
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database_path} ")
+        df.replace({np.nan: None}, inplace=True)
+
+        columns = tuple(df.columns)
+        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) \
+            VALUES({','.join(['?' for x in columns])})"  # noqa E501
+
+        for rows in pd.read_json(
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+        ):
+            with self.connection_config.get_connection() as conn:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                conn.executemany(stmt, values)
+                conn.commit()
+
+
+sqlite_destination_entry = DestinationRegistryEntry(
+    connection_config=SQLiteConnectionConfig,
+    uploader=SQLiteUploader,
+    uploader_config=SQLiteUploaderConfig,
+    upload_stager=SQLiteUploadStager,
+    upload_stager_config=SQLiteUploadStagerConfig,
+)
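As a rough usage sketch, the pieces above can be wired together directly; the database path, table name, and staged file below are illustrative, and the .db file must already exist because the connection config validates the path:

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
    SQLiteConnectionConfig,
    SQLiteUploader,
    SQLiteUploaderConfig,
)

# Illustrative paths: elements.db must already contain an "elements" table,
# and staged.json is a JSON-lines file produced by the SQL upload stager.
uploader = SQLiteUploader(
    connection_config=SQLiteConnectionConfig(database_path=Path("elements.db")),
    upload_config=SQLiteUploaderConfig(table_name="elements", batch_size=50),
)
uploader.precheck()  # runs "SELECT 1;" to validate the connection
uploader.upload_contents(path=Path("staged.json"))  # batched INSERTs of the staged records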
unstructured_ingest/v2/processes/embedder.py
@@ -22,6 +22,7 @@ class EmbedderConfig(BaseModel):
             "voyageai",
             "octoai",
             "mixedbread-ai",
+            "togetherai",
         ]
     ] = Field(default=None, description="Type of the embedding class to be used.")
     embedding_api_key: Optional[SecretStr] = Field(
@@ -107,6 +108,16 @@ class EmbedderConfig(BaseModel):
             config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
         )
 
+    def get_togetherai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured_ingest.embed.togetherai import (
+            TogetherAIEmbeddingConfig,
+            TogetherAIEmbeddingEncoder,
+        )
+
+        return TogetherAIEmbeddingEncoder(
+            config=TogetherAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )
+
     def get_embedder(self) -> "BaseEmbeddingEncoder":
         kwargs: dict[str, Any] = {}
         if self.embedding_api_key:
@@ -133,6 +144,8 @@ class EmbedderConfig(BaseModel):
             return self.get_voyageai_embedder(embedding_kwargs=kwargs)
         if self.embedding_provider == "mixedbread-ai":
             return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
+        if self.embedding_provider == "togetherai":
+            return self.get_togetherai_embedder(embedding_kwargs=kwargs)
 
         raise ValueError(f"{self.embedding_provider} not a recognized encoder")
 
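A hedged sketch of selecting the new provider through EmbedderConfig; only embedding_provider and embedding_api_key appear in this diff, and the key below is a placeholder, not a real credential:

from unstructured_ingest.v2.processes.embedder import EmbedderConfig

config = EmbedderConfig(
    embedding_provider="togetherai",
    embedding_api_key="together-api-key-placeholder",  # illustrative only
)
encoder = config.get_embedder()  # dispatches to get_togetherai_embedder(); requires the new togetherai extra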
unstructured_ingest/v2/processes/partitioner.py
@@ -55,7 +55,7 @@ class PartitionerConfig(BaseModel):
         "fields if they exist and drop all other fields. ",
     )
     partition_endpoint: Optional[str] = Field(
-        default="https://api.
+        default="https://api.unstructuredapp.io/general/v0/general",
         description="If partitioning via api, use the following host.",
     )
     partition_by_api: bool = Field(
@@ -153,6 +153,7 @@ class Partitioner(BaseProcess, ABC):
     async def partition_via_api(
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
+        metadata = metadata or {}
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
 
         elements = await call_api(
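The one-line partitioner change normalizes a None metadata argument before it reaches the log line and the API call; a small standalone sketch of the guard, using a hypothetical helper name:

from typing import Optional

def normalize_metadata(metadata: Optional[dict] = None) -> dict:
    # Same pattern as the line added to partition_via_api: treat None as an empty dict.
    return metadata or {}

assert normalize_metadata() == {}
assert normalize_metadata({"filename": "doc.pdf"}) == {"filename": "doc.pdf"}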
{unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.0.25
+Version: 0.1.1
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: python-dateutil
+Requires-Dist: pandas
 Requires-Dist: pydantic>=2.7
 Requires-Dist: dataclasses-json
 Requires-Dist: opentelemetry-sdk
-Requires-Dist: python-dateutil
-Requires-Dist: pandas
-Requires-Dist: tqdm
 Requires-Dist: click
+Requires-Dist: tqdm
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -41,11 +41,11 @@ Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -88,11 +88,11 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: gcsfs; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
@@ -105,7 +105,7 @@ Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
 Provides-Extra: kdbai
-Requires-Dist: kdbai-client; extra == "kdbai"
+Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus
@@ -116,15 +116,15 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -156,13 +156,13 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -170,6 +170,8 @@ Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk; extra == "slack"
+Provides-Extra: togetherai
+Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vectara