unstructured-ingest 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_pinecone.py +68 -2
- test/unit/v2/connectors/sql/__init__.py +0 -0
- test/unit/v2/connectors/sql/test_sql.py +72 -0
- test/unit/v2/connectors/test_confluence.py +6 -6
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/processes/connectors/confluence.py +30 -10
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +28 -9
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/METADATA +22 -20
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/RECORD +23 -18
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -5,12 +5,10 @@ from typing import TYPE_CHECKING, Any, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import (
-    flatten_dict,
-    generator_batching_wbytes,
-)
+from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -63,6 +61,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         pc = self.get_client()
 
         index = pc.Index(name=self.index_name, **index_kwargs)
+
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
 
@@ -182,14 +181,18 @@ class PineconeUploader(Uploader):
         delete_kwargs = {
             "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
         }
+
         if namespace := self.upload_config.namespace:
             delete_kwargs["namespace"] = namespace
+        try:
+            index.delete(**delete_kwargs)
+        except UserError as e:
+            logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
 
-        resp = index.delete(**delete_kwargs)
         logger.debug(
             f"deleted any content with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
-            f"from pinecone index: {
+            f"from pinecone index: {delete_kwargs}"
         )
 
     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
@@ -203,15 +206,19 @@ class PineconeUploader(Uploader):
         deleted_ids = 0
         if namespace := self.upload_config.namespace:
             list_kwargs["namespace"] = namespace
+
         for ids in index.list(**list_kwargs):
             deleted_ids += len(ids)
             delete_kwargs = {"ids": ids}
+
             if namespace := self.upload_config.namespace:
-
-
-
-
-
+                delete_kwargs["namespace"] = namespace
+
+            try:
+                index.delete(**delete_kwargs)
+            except UserError as e:
+                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
+
         logger.info(
             f"deleted {deleted_ids} records with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
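The net effect of the Pinecone hunks above: deletes keyed on the record-id metadata are now wrapped in try/except UserError and logged instead of raised. A minimal sketch of that pattern, assuming an already-connected pinecone Index object; the helper name and signature below are illustrative, not part of the package:

from typing import Optional

from unstructured_ingest.v2.errors import UserError
from unstructured_ingest.v2.logger import logger


def delete_record_vectors(index, record_id_key: str, record_id: str, namespace: Optional[str] = None) -> None:
    # Build the same metadata-filter delete used by the uploader above.
    delete_kwargs = {"filter": {record_id_key: {"$eq": record_id}}}
    if namespace:
        delete_kwargs["namespace"] = namespace
    try:
        # A failed delete batch is logged rather than aborting the whole upload.
        index.delete(**delete_kwargs)
    except UserError as e:
        logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")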
unstructured_ingest/v2/processes/connectors/sql/__init__.py

@@ -5,6 +5,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
+from .databricks_delta_tables import CONNECTOR_TYPE as DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE
+from .databricks_delta_tables import databricks_delta_tables_destination_entry
 from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
 from .postgres import postgres_destination_entry, postgres_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
@@ -25,3 +27,7 @@ add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake
 add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
+add_destination_entry(
+    destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
+    entry=databricks_delta_tables_destination_entry,
+)
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -0,0 +1,213 @@
+import json
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import split_dataframe
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from databricks.sdk.core import oauth_service_principal
+    from databricks.sql.client import Connection as DeltaTableConnection
+    from databricks.sql.client import Cursor as DeltaTableCursor
+
+CONNECTOR_TYPE = "databricks_delta_tables"
+
+
+class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
+
+
+class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabrickDeltaTablesAccessConfig]
+    server_hostname: str = Field(description="server hostname connection config value")
+    http_path: str = Field(description="http path connection config value")
+    user_agent: str = "unstructuredio_oss"
+
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_credentials_provider(self) -> "oauth_service_principal":
+        from databricks.sdk.core import Config, oauth_service_principal
+
+        host = f"https://{self.server_hostname}"
+        access_configs = self.access_config.get_secret_value()
+        if (client_id := access_configs.client_id) and (
+            client_secret := access_configs.client_secret
+        ):
+            return oauth_service_principal(
+                Config(
+                    host=host,
+                    client_id=client_id,
+                    client_secret=client_secret,
+                )
+            )
+        return False
+
+    def model_post_init(self, __context: Any) -> None:
+        access_config = self.access_config.get_secret_value()
+        if access_config.token and access_config.client_secret and access_config.client_id:
+            raise ValueError(
+                "One one for of auth can be provided, either token or client id and secret"
+            )
+        if not access_config.token and not (
+            access_config.client_secret and access_config.client_id
+        ):
+            raise ValueError(
+                "One form of auth must be provided, either token or client id and secret"
+            )
+
+    @contextmanager
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_connection(self, **connect_kwargs) -> Generator["DeltaTableConnection", None, None]:
+        from databricks.sql import connect
+
+        connect_kwargs = connect_kwargs or {}
+        connect_kwargs["_user_agent_entry"] = self.user_agent
+        connect_kwargs["server_hostname"] = connect_kwargs.get(
+            "server_hostname", self.server_hostname
+        )
+        connect_kwargs["http_path"] = connect_kwargs.get("http_path", self.http_path)
+
+        if credential_provider := self.get_credentials_provider():
+            connect_kwargs["credentials_provider"] = credential_provider
+        else:
+            connect_kwargs["access_token"] = self.access_config.get_secret_value().token
+        with connect(**connect_kwargs) as connection:
+            yield connection
+
+    @contextmanager
+    def get_cursor(self, **connect_kwargs) -> Generator["DeltaTableCursor", None, None]:
+        with self.get_connection(**connect_kwargs) as connection:
+            cursor = connection.cursor()
+            yield cursor
+
+
+class DatabrickDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class DatabrickDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabrickDeltaTablesUploadStagerConfig
+
+
+class DatabrickDeltaTablesUploaderConfig(SQLUploaderConfig):
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    database: str = Field(description="Database name", default="default")
+    table_name: str = Field(description="Table name")
+
+
+@dataclass
+class DatabrickDeltaTablesUploader(SQLUploader):
+    upload_config: DatabrickDeltaTablesUploaderConfig
+    connection_config: DatabrickDeltaTablesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            yield cursor
+
+    def precheck(self) -> None:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute("SHOW CATALOGS")
+            catalogs = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.catalog not in catalogs:
+                raise ValueError(
+                    "Catalog {} not found in {}".format(
+                        self.upload_config.catalog, ", ".join(catalogs)
+                    )
+                )
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            cursor.execute("SHOW DATABASES")
+            databases = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.database not in databases:
+                raise ValueError(
+                    "Database {} not found in {}".format(
+                        self.upload_config.database, ", ".join(databases)
+                    )
+                )
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if self.upload_config.table_name not in table_names:
+                raise ValueError(
+                    "Table {} not found in {}".format(
+                        self.upload_config.table_name, ", ".join(table_names)
+                    )
+                )
+
+    def create_statement(self, columns: list[str], values: tuple[Any, ...]) -> str:
+        values_list = []
+        for v in values:
+            if isinstance(v, dict):
+                values_list.append(json.dumps(v))
+            elif isinstance(v, list):
+                if v and isinstance(v[0], (int, float)):
+                    values_list.append("ARRAY({})".format(", ".join([str(val) for val in v])))
+                else:
+                    values_list.append("ARRAY({})".format(", ".join([f"'{val}'" for val in v])))
+            else:
+                values_list.append(f"'{v}'")
+        statement = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=", ".join(columns),
+            values=", ".join(values_list),
+        )
+        return statement
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        if self.can_delete():
+            self.delete_by_record_id(file_data=file_data)
+        else:
+            logger.warning(
+                f"table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+        df.replace({np.nan: None}, inplace=True)
+        self._fit_to_schema(df=df)
+
+        columns = list(df.columns)
+        logger.info(
+            f"writing a total of {len(df)} elements via"
+            f" document batches to destination"
+            f" table named {self.upload_config.table_name}"
+            # f" with batch size {self.upload_config.batch_size}"
+        )
+        # TODO: currently variable binding not supporting for list types,
+        # update once that gets resolved in SDK
+        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
+            with self.get_cursor() as cursor:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                for v in values:
+                    stmt = self.create_statement(columns=columns, values=v)
+                    cursor.execute(stmt)
+
+
+databricks_delta_tables_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabrickDeltaTablesConnectionConfig,
+    uploader=DatabrickDeltaTablesUploader,
+    uploader_config=DatabrickDeltaTablesUploaderConfig,
+    upload_stager=DatabrickDeltaTablesUploadStager,
+    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+)
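For orientation, a minimal usage sketch of the new destination defined above. The class names and fields come from the file itself; the hostname, HTTP path, token, catalog, and table values are placeholders, and it is assumed the new databricks-delta-tables extra (databricks-sql-connector) is installed:

# Hypothetical configuration of the new Databricks Delta Tables destination;
# all credential and endpoint values below are placeholders.
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    DatabrickDeltaTablesAccessConfig,
    DatabrickDeltaTablesConnectionConfig,
    DatabrickDeltaTablesUploader,
    DatabrickDeltaTablesUploaderConfig,
)

connection_config = DatabrickDeltaTablesConnectionConfig(
    access_config=DatabrickDeltaTablesAccessConfig(token="<personal-access-token>"),
    server_hostname="<workspace>.cloud.databricks.com",
    http_path="/sql/1.0/warehouses/<warehouse-id>",
)

uploader = DatabrickDeltaTablesUploader(
    connection_config=connection_config,
    upload_config=DatabrickDeltaTablesUploaderConfig(
        catalog="<catalog>",
        database="default",
        table_name="<table>",
    ),
)

# precheck() validates that the catalog, database, and table exist before any writes.
uploader.precheck()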
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -170,7 +170,7 @@ class SnowflakeUploader(SQLUploader):
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df
+        self._fit_to_schema(df=df)
 
         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -129,8 +129,13 @@ class SQLIndexer(Indexer, ABC):
     connection_config: SQLConnectionConfig
     index_config: SQLIndexerConfig
 
-
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.get_cursor() as cursor:
             cursor.execute(
                 f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
             )
@@ -140,7 +145,7 @@ class SQLIndexer(Indexer, ABC):
 
     def precheck(self) -> None:
         try:
-            with self.
+            with self.get_cursor() as cursor:
                 cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -182,6 +187,11 @@ class SQLDownloader(Downloader, ABC):
     connection_config: SQLConnectionConfig
     download_config: SQLDownloaderConfig
 
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
     @abstractmethod
     def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         pass
@@ -300,6 +310,8 @@ class SQLUploadStager(UploadStager):
         )
         df = self.conform_dataframe(df=df)
 
+        output_filename_suffix = Path(elements_filepath).suffix
+        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
         self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
@@ -323,12 +335,17 @@ class SQLUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            with self.
+            with self.get_cursor() as cursor:
                 cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
@@ -346,7 +363,7 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output
 
-    def _fit_to_schema(self, df: pd.DataFrame
+    def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
         columns = set(df.columns)
         schema_fields = set(columns)
         columns_to_drop = columns - schema_fields
@@ -367,6 +384,7 @@ class SQLUploader(Uploader):
 
         for column in missing_columns:
             df[column] = pd.Series()
+        return df
 
     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         if self.can_delete():
@@ -378,7 +396,7 @@ class SQLUploader(Uploader):
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df
+        self._fit_to_schema(df=df)
 
         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
@@ -393,7 +411,7 @@ class SQLUploader(Uploader):
             f" with batch size {self.upload_config.batch_size}"
         )
         for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
-            with self.
+            with self.get_cursor() as cursor:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                 # For debugging purposes:
                 # for val in values:
@@ -406,7 +424,7 @@ class SQLUploader(Uploader):
                 cursor.executemany(stmt, values)
 
     def get_table_columns(self) -> list[str]:
-        with self.
+        with self.get_cursor() as cursor:
             cursor.execute(f"SELECT * from {self.upload_config.table_name}")
             return [desc[0] for desc in cursor.description]
 
@@ -420,10 +438,11 @@ class SQLUploader(Uploader):
             f"from table {self.upload_config.table_name}"
         )
         stmt = f"DELETE FROM {self.upload_config.table_name} WHERE {self.upload_config.record_id_key} = {self.values_delimiter}"  # noqa: E501
-        with self.
+        with self.get_cursor() as cursor:
             cursor.execute(stmt, [file_data.identifier])
             rowcount = cursor.rowcount
-
+            if rowcount > 0:
+                logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")
 
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         df = pd.DataFrame(data)
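The recurring sql.py change above replaces direct connection_config.get_cursor() calls with a get_cursor() contextmanager on the indexer, downloader, and uploader themselves, which lets a subclass inject per-session setup (like the Databricks uploader's USE CATALOG statement) in one place. A standalone sketch of that indirection with illustrative class names, not the package's API:

# Illustrative sketch of the get_cursor() indirection; runnable against an in-memory SQLite DB.
import sqlite3
from contextlib import contextmanager
from typing import Any, Generator


class DummyConnectionConfig:
    """Stand-in for a SQL connection config; yields a cursor on an in-memory DB."""

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        connection = sqlite3.connect(":memory:")
        try:
            yield connection.cursor()
        finally:
            connection.close()


class BaseUploader:
    def __init__(self, connection_config: DummyConnectionConfig):
        self.connection_config = connection_config

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Default behaviour: hand through the connection config's cursor unchanged.
        with self.connection_config.get_cursor() as cursor:
            yield cursor

    def precheck(self) -> None:
        # Mirrors the refactor above: query paths call self.get_cursor(),
        # not connection_config.get_cursor() directly.
        with self.get_cursor() as cursor:
            cursor.execute("SELECT 1;")


class SessionScopedUploader(BaseUploader):
    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Analogous to the Databricks uploader's "USE CATALOG ..." session setup.
        with self.connection_config.get_cursor() as cursor:
            cursor.execute("PRAGMA case_sensitive_like = ON;")  # placeholder setup statement
            yield cursor


SessionScopedUploader(DummyConnectionConfig()).precheck()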
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.4.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.14
+Version: 0.4.0
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,43 +22,45 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist:
+Requires-Dist: click
+Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
+Requires-Dist: ndjson
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: tqdm
 Requires-Dist: dataclasses-json
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: click
-Requires-Dist: ndjson
-Requires-Dist: pydantic>=2.7
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
 Requires-Dist: unstructured[tsv]; extra == "csv"
+Provides-Extra: databricks-delta-tables
+Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
@@ -82,8 +84,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -91,8 +93,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -102,8 +104,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -122,12 +124,12 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: neo4j
 Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: neo4j; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
@@ -136,8 +138,8 @@ Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -168,13 +170,13 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -183,16 +185,16 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vectara
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
-Requires-Dist: httpx; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"
 Provides-Extra: wikipedia