unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +14 -12
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +82 -29
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
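Highlights of this release, as reflected in the listing above: the integration and unit test suite appears to be packaged alongside the library for the first time (the new top-level test/ tree and the top_level.txt change), Databricks Volumes connectors and a TogetherAI embedder are added, the MongoDB connector is substantially extended, and the monolithic unstructured_ingest/v2/processes/connectors/sql.py module is replaced by the new sql/ subpackage (sql.py, postgres.py, sqlite.py); its full removal is shown below.

To reproduce this file-level comparison locally, here is a minimal sketch using only the Python standard library. The wheel filenames are assumptions about what `pip download unstructured-ingest==<version> --no-deps` places in the current directory.

import zipfile

# Hypothetical local filenames for the two wheels being compared.
OLD = "unstructured_ingest-0.0.25-py3-none-any.whl"
NEW = "unstructured_ingest-0.1.0-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_files = set(old_whl.namelist())
    new_files = set(new_whl.namelist())

# File-level summary: which paths were added, removed, or kept.
print("added:", sorted(new_files - old_files))
print("removed:", sorted(old_files - new_files))
print("kept:", len(old_files & new_files))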
unstructured_ingest/v2/processes/connectors/sql.py
DELETED
@@ -1,275 +0,0 @@
-import json
-import uuid
-from dataclasses import dataclass, field
-from datetime import date, datetime
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
-
-import numpy as np
-import pandas as pd
-from dateutil import parser
-from pydantic import Field, Secret
-
-from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import (
-    AccessConfig,
-    ConnectionConfig,
-    FileData,
-    Uploader,
-    UploaderConfig,
-    UploadStager,
-    UploadStagerConfig,
-)
-from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
-
-if TYPE_CHECKING:
-    from sqlite3 import Connection as SqliteConnection
-
-    from psycopg2.extensions import connection as PostgresConnection
-
-CONNECTOR_TYPE = "sql"
-ELEMENTS_TABLE_NAME = "elements"
-SQLITE_DB = "sqlite"
-POSTGRESQL_DB = "postgresql"
-
-
-class SQLAccessConfig(AccessConfig):
-    username: Optional[str] = Field(default=None, description="DB username")
-    password: Optional[str] = Field(default=None, description="DB password")
-
-
-class SQLConnectionConfig(ConnectionConfig):
-    db_type: Literal["sqlite", "postgresql"] = Field(
-        default=SQLITE_DB, description="Type of the database backend"
-    )
-    database: Optional[str] = Field(
-        default=None,
-        description="Database name. For sqlite databases, this is the path to the .db file.",
-    )
-    host: Optional[str] = Field(default=None, description="DB host")
-    port: Optional[int] = Field(default=5432, description="DB host connection port")
-    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
-    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-
-    def __post_init__(self):
-        if (self.db_type == SQLITE_DB) and (self.database is None):
-            raise ValueError(
-                "A sqlite connection requires a path to a *.db file "
-                "through the `database` argument"
-            )
-
-
-class SQLUploadStagerConfig(UploadStagerConfig):
-    pass
-
-
-_COLUMNS = (
-    "id",
-    "element_id",
-    "text",
-    "embeddings",
-    "type",
-    "system",
-    "layout_width",
-    "layout_height",
-    "points",
-    "url",
-    "version",
-    "date_created",
-    "date_modified",
-    "date_processed",
-    "permissions_data",
-    "record_locator",
-    "category_depth",
-    "parent_id",
-    "attached_filename",
-    "filetype",
-    "last_modified",
-    "file_directory",
-    "filename",
-    "languages",
-    "page_number",
-    "links",
-    "page_name",
-    "link_urls",
-    "link_texts",
-    "sent_from",
-    "sent_to",
-    "subject",
-    "section",
-    "header_footer_type",
-    "emphasized_text_contents",
-    "emphasized_text_tags",
-    "text_as_html",
-    "regex_metadata",
-    "detection_class_prob",
-)
-
-_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
-
-
-def parse_date_string(date_value: Union[str, int]) -> date:
-    try:
-        timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
-        return datetime.fromtimestamp(timestamp)
-    except Exception as e:
-        logger.debug(f"date {date_value} string not a timestamp: {e}")
-    return parser.parse(date_value)
-
-
-@dataclass
-class SQLUploadStager(UploadStager):
-    upload_stager_config: SQLUploadStagerConfig = field(
-        default_factory=lambda: SQLUploadStagerConfig()
-    )
-
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict] = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
-            data_source = metadata.pop("data_source", {})
-            coordinates = metadata.pop("coordinates", {})
-
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
-
-            data["id"] = str(uuid.uuid4())
-
-            # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
-
-        df = pd.DataFrame.from_dict(output)
-        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
-            df[column] = df[column].apply(parse_date_string)
-        for column in filter(
-            lambda x: x in df.columns,
-            ("permissions_data", "record_locator", "points", "links"),
-        ):
-            df[column] = df[column].apply(
-                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
-            )
-        for column in filter(
-            lambda x: x in df.columns,
-            ("version", "page_number", "regex_metadata"),
-        ):
-            df[column] = df[column].apply(str)
-
-        with output_path.open("w") as output_file:
-            df.to_json(output_file, orient="records", lines=True)
-        return output_path
-
-
-class SQLUploaderConfig(UploaderConfig):
-    batch_size: int = Field(default=50, description="Number of records per batch")
-
-
-@dataclass
-class SQLUploader(Uploader):
-    connector_type: str = CONNECTOR_TYPE
-    upload_config: SQLUploaderConfig
-    connection_config: SQLConnectionConfig
-
-    def precheck(self) -> None:
-        try:
-            cursor = self.connection().cursor()
-            cursor.execute("SELECT 1;")
-            cursor.close()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
-
-    @property
-    def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
-        if self.connection_config.db_type == POSTGRESQL_DB:
-            return self._make_psycopg_connection
-        elif self.connection_config.db_type == SQLITE_DB:
-            return self._make_sqlite_connection
-        raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
-
-    def _make_sqlite_connection(self) -> "SqliteConnection":
-        from sqlite3 import connect
-
-        return connect(database=self.connection_config.database)
-
-    @requires_dependencies(["psycopg2"], extras="postgres")
-    def _make_psycopg_connection(self) -> "PostgresConnection":
-        from psycopg2 import connect
-
-        access_config = self.connection_config.access_config.get_secret_value()
-        return connect(
-            user=access_config.username,
-            password=access_config.password,
-            dbname=self.connection_config.database,
-            host=self.connection_config.host,
-            port=self.connection_config.port,
-        )
-
-    def prepare_data(
-        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
-    ) -> list[tuple[Any, ...]]:
-        output = []
-        for row in data:
-            parsed = []
-            for column_name, value in zip(columns, row):
-                if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
-                    value = json.dumps(value)
-                if column_name in _DATE_COLUMNS:
-                    if value is None:
-                        parsed.append(None)
-                    else:
-                        parsed.append(parse_date_string(value))
-                else:
-                    parsed.append(value)
-            output.append(tuple(parsed))
-        return output
-
-    def upload_contents(self, path: Path) -> None:
-        df = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
-        df.replace({np.nan: None}, inplace=True)
-
-        columns = tuple(df.columns)
-        stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
-            VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})" # noqa E501
-
-        for rows in pd.read_json(
-            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
-        ):
-            with self.connection() as conn:
-                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                if self.connection_config.db_type == SQLITE_DB:
-                    conn.executemany(stmt, values)
-                else:
-                    with conn.cursor() as cur:
-                        cur.executemany(stmt, values)
-
-                conn.commit()
-
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        self.upload_contents(path=path)
-
-
-sql_destination_entry = DestinationRegistryEntry(
-    connection_config=SQLConnectionConfig,
-    uploader=SQLUploader,
-    uploader_config=SQLUploaderConfig,
-    upload_stager=SQLUploadStager,
-    upload_stager_config=SQLUploadStagerConfig,
-)
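The hunk above is the 0.0.25-era monolithic SQL destination that 0.1.0 removes in favour of the sql/ subpackage listed earlier. As a minimal sketch of how the removed classes were wired together (assuming a partitioned-elements JSON file already exists and the target SQLite database already contains an `elements` table; the paths and the `file_data` stand-in are hypothetical):

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql import (  # the module removed in 0.1.0
    SQLConnectionConfig,
    SQLUploader,
    SQLUploaderConfig,
    SQLUploadStager,
    SQLUploadStagerConfig,
)

file_data = None  # stand-in; in a real pipeline this is the FileData record from the source connector

# Stage: flatten element metadata, normalize dates, and write a JSON-lines file.
stager = SQLUploadStager(upload_stager_config=SQLUploadStagerConfig())
staged_path = stager.run(
    elements_filepath=Path("work/partitioned/example.json"),  # hypothetical path
    file_data=file_data,
    output_dir=Path("work/staged"),
    output_filename="example",
)

# Upload: batched INSERTs into the `elements` table (must already exist).
uploader = SQLUploader(
    connection_config=SQLConnectionConfig(db_type="sqlite", database="elements.db"),
    upload_config=SQLUploaderConfig(batch_size=50),
)
uploader.precheck()  # raises DestinationConnectionError if `SELECT 1` fails
uploader.run(path=staged_path, file_data=file_data)

Per the listing above, 0.1.0 splits this behaviour across sql/sql.py, sql/postgres.py, and sql/sqlite.py.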
{unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md
RENAMED
File without changes

{unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL
RENAMED
File without changes

{unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt
RENAMED
File without changes