unstructured-ingest 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two package versions as published.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_databricks_delta_tables.py +142 -0
- test/integration/connectors/test_pinecone.py +68 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +6 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +106 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +18 -11
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +6 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +213 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +26 -9
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/METADATA +22 -20
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/RECORD +17 -14
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/top_level.txt +0 -0
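
In summary, 0.3.15 adds a Databricks Delta Tables SQL destination connector plus a companion destination that stages JSON in a Databricks Volume and copies it into a Delta table, namespace support and guarded deletes for the Pinecone destination, an `indent` parameter on `UploadStager.write_output`, shared `get_cursor` context managers on the SQL indexer/downloader/uploader base classes, and the usual version, metadata, and record updates.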
test/integration/connectors/sql/test_databricks_delta_tables.py

@@ -0,0 +1,142 @@
+import json
+import os
+import time
+from contextlib import contextmanager
+from pathlib import Path
+from uuid import uuid4
+
+import pytest
+from databricks.sql import connect
+from databricks.sql.client import Connection as DeltaTableConnection
+from databricks.sql.client import Cursor as DeltaTableCursor
+from pydantic import BaseModel, SecretStr
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG, env_setup_path
+from test.integration.utils import requires_env
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
+    CONNECTOR_TYPE,
+    DatabrickDeltaTablesAccessConfig,
+    DatabrickDeltaTablesConnectionConfig,
+    DatabrickDeltaTablesUploader,
+    DatabrickDeltaTablesUploaderConfig,
+    DatabrickDeltaTablesUploadStager,
+)
+
+CATALOG = "utic-dev-tech-fixtures"
+
+
+class EnvData(BaseModel):
+    server_hostname: str
+    http_path: str
+    access_token: SecretStr
+
+
+def get_env_data() -> EnvData:
+    return EnvData(
+        server_hostname=os.environ["DATABRICKS_SERVER_HOSTNAME"],
+        http_path=os.environ["DATABRICKS_HTTP_PATH"],
+        access_token=os.environ["DATABRICKS_ACCESS_TOKEN"],
+    )
+
+
+def get_destination_schema(new_table_name: str) -> str:
+    p = Path(env_setup_path / "sql" / "databricks_delta_tables" / "destination" / "schema.sql")
+    with p.open() as f:
+        data_lines = f.readlines()
+    data_lines[0] = data_lines[0].replace("elements", new_table_name)
+    data = "".join([line.strip() for line in data_lines])
+    return data
+
+
+@contextmanager
+def get_connection() -> DeltaTableConnection:
+    env_data = get_env_data()
+    with connect(
+        server_hostname=env_data.server_hostname,
+        http_path=env_data.http_path,
+        access_token=env_data.access_token.get_secret_value(),
+    ) as connection:
+        yield connection
+
+
+@contextmanager
+def get_cursor() -> DeltaTableCursor:
+    with get_connection() as connection:
+        with connection.cursor() as cursor:
+            cursor.execute(f"USE CATALOG '{CATALOG}'")
+            yield cursor
+
+
+@pytest.fixture
+def destination_table() -> str:
+    random_id = str(uuid4())[:8]
+    table_name = f"elements_{random_id}"
+    destination_schema = get_destination_schema(new_table_name=table_name)
+    with get_cursor() as cursor:
+        logger.info(f"creating table: {table_name}")
+        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
+        cursor.execute(destination_schema)
+
+    yield table_name
+    with get_cursor() as cursor:
+        logger.info(f"dropping table: {table_name}")
+        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
+
+
+def validate_destination(expected_num_elements: int, table_name: str, retries=30, interval=1):
+    with get_cursor() as cursor:
+        for i in range(retries):
+            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
+            count = cursor.fetchone()[0]
+            if count == expected_num_elements:
+                break
+            logger.info(f"retry attempt {i}: expected {expected_num_elements} != count {count}")
+            time.sleep(interval)
+    assert (
+        count == expected_num_elements
+    ), f"dest check failed: got {count}, expected {expected_num_elements}"
+
+
+@pytest.mark.asyncio
+@pytest.mark.skip("Resources take too long to spin up to run in CI")
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
+@requires_env("DATABRICKS_SERVER_HOSTNAME", "DATABRICKS_HTTP_PATH", "DATABRICKS_ACCESS_TOKEN")
+async def test_databricks_delta_tables_destination(
+    upload_file: Path, temp_dir: Path, destination_table: str
+):
+    env_data = get_env_data()
+    mock_file_data = FileData(
+        identifier="mock file data",
+        connector_type=CONNECTOR_TYPE,
+        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
+    )
+    stager = DatabrickDeltaTablesUploadStager()
+    staged_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=mock_file_data,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+    )
+
+    assert staged_path.suffix == upload_file.suffix
+
+    uploader = DatabrickDeltaTablesUploader(
+        connection_config=DatabrickDeltaTablesConnectionConfig(
+            access_config=DatabrickDeltaTablesAccessConfig(
+                token=env_data.access_token.get_secret_value()
+            ),
+            http_path=env_data.http_path,
+            server_hostname=env_data.server_hostname,
+        ),
+        upload_config=DatabrickDeltaTablesUploaderConfig(
+            catalog=CATALOG, database="default", table_name=destination_table
+        ),
+    )
+    with staged_path.open("r") as f:
+        staged_data = json.load(f)
+    expected_num_elements = len(staged_data)
+    uploader.precheck()
+    uploader.run(path=staged_path, file_data=mock_file_data)
+    validate_destination(expected_num_elements=expected_num_elements, table_name=destination_table)
test/integration/connectors/test_pinecone.py

@@ -107,11 +107,15 @@ def pinecone_index() -> Generator[str, None, None]:


 def validate_pinecone_index(
-    index_name: str,
+    index_name: str,
+    expected_num_of_vectors: int,
+    retries=30,
+    interval=1,
+    namespace: str = "default",
 ) -> None:
     # Because there's a delay for the index to catch up to the recent writes, add in a retry
     pinecone = Pinecone(api_key=get_api_key())
-    index = pinecone.Index(name=index_name)
+    index = pinecone.Index(name=index_name, namespace=namespace)
     vector_count = -1
     for i in range(retries):
         index_stats = index.describe_index_stats()
@@ -133,11 +137,13 @@ def validate_pinecone_index(
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
+
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
         connector_type=CONNECTOR_TYPE,
         identifier="pinecone_mock_id",
     )
+
     connection_config = PineconeConnectionConfig(
         index_name=pinecone_index,
         access_config=PineconeAccessConfig(api_key=get_api_key()),
@@ -224,6 +230,66 @@ async def test_pinecone_destination_large_index(
 )


+@requires_env(API_KEY)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
+async def test_pinecone_destination_namespace(
+    pinecone_index: str, upload_file: Path, temp_dir: Path
+):
+    """
+    tests namespace functionality of destination connector.
+    """
+
+    # creates a file data structure.
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="pinecone_mock_id",
+    )
+
+    connection_config = PineconeConnectionConfig(
+        index_name=pinecone_index,
+        access_config=PineconeAccessConfig(api_key=get_api_key()),
+    )
+
+    stager_config = PineconeUploadStagerConfig()
+
+    stager = PineconeUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=upload_file,
+        output_dir=temp_dir,
+        output_filename=upload_file.name,
+        file_data=file_data,
+    )
+
+    # here add namespace defintion
+    upload_config = PineconeUploaderConfig()
+    namespace_test_name = "user-1"
+    upload_config.namespace = namespace_test_name
+    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+
+    uploader.run(path=new_upload_file, file_data=file_data)
+    with new_upload_file.open() as f:
+        staged_content = json.load(f)
+    expected_num_of_vectors = len(staged_content)
+    logger.info("validating first upload")
+    validate_pinecone_index(
+        index_name=pinecone_index,
+        expected_num_of_vectors=expected_num_of_vectors,
+        namespace=namespace_test_name,
+    )
+
+    # Rerun uploader and make sure no duplicates exist
+    uploader.run(path=new_upload_file, file_data=file_data)
+    logger.info("validating second upload")
+    validate_pinecone_index(
+        index_name=pinecone_index,
+        expected_num_of_vectors=expected_num_of_vectors,
+        namespace=namespace_test_name,
+    )
+
+
 @requires_env(API_KEY)
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
 def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.3.
+__version__ = "0.3.15"  # pragma: no cover
|
@@ -2,7 +2,7 @@ import json
|
|
|
2
2
|
from abc import ABC
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Any, TypeVar
|
|
5
|
+
from typing import Any, Optional, TypeVar
|
|
6
6
|
|
|
7
7
|
import ndjson
|
|
8
8
|
from pydantic import BaseModel
|
|
@@ -22,10 +22,10 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
|
|
|
22
22
|
class UploadStager(BaseProcess, ABC):
|
|
23
23
|
upload_stager_config: UploadStagerConfigT
|
|
24
24
|
|
|
25
|
-
def write_output(self, output_path: Path, data: list[dict]) -> None:
|
|
25
|
+
def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
|
|
26
26
|
if output_path.suffix == ".json":
|
|
27
27
|
with output_path.open("w") as f:
|
|
28
|
-
json.dump(data, f, indent=
|
|
28
|
+
json.dump(data, f, indent=indent)
|
|
29
29
|
elif output_path.suffix == ".ndjson":
|
|
30
30
|
with output_path.open("w") as f:
|
|
31
31
|
ndjson.dump(data, f)
|
|
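The new `indent` parameter preserves the old default (pretty-printed JSON with an indent of 2) while letting callers and subclasses write compact output. A minimal sketch, assuming unstructured-ingest 0.3.15 is installed; the output path and element data are placeholders:

```python
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    DatabrickDeltaTablesUploadStager,
)

stager = DatabrickDeltaTablesUploadStager()
# indent=None makes json.dump write the element list on a single line; the new
# volumes -> delta tables stager goes further and always writes single-line JSON.
stager.write_output(
    output_path=Path("/tmp/elements.json"),
    data=[{"element_id": "abc", "text": "hello"}],
    indent=None,
)
```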
unstructured_ingest/v2/processes/connectors/databricks/__init__.py

@@ -25,6 +25,8 @@ from .volumes_native import (
     databricks_native_volumes_destination_entry,
     databricks_native_volumes_source_entry,
 )
+from .volumes_table import CONNECTOR_TYPE as VOLUMES_TABLE_CONNECTOR_TYPE
+from .volumes_table import databricks_volumes_delta_tables_destination_entry

 add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
 add_destination_entry(
@@ -50,3 +52,7 @@ add_source_entry(
 add_destination_entry(
     destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
 )
+add_destination_entry(
+    destination_type=VOLUMES_TABLE_CONNECTOR_TYPE,
+    entry=databricks_volumes_delta_tables_destination_entry,
+)
unstructured_ingest/v2/processes/connectors/databricks/volumes.py

@@ -187,6 +187,11 @@ class DatabricksVolumesUploader(Uploader, ABC):
     upload_config: DatabricksVolumesUploaderConfig
     connection_config: DatabricksVolumesConnectionConfig

+    def get_output_path(self, file_data: FileData) -> str:
+        return os.path.join(
+            self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
+        )
+
     def precheck(self) -> None:
         try:
             assert self.connection_config.get_client().current_user.me().active
@@ -194,9 +199,7 @@ class DatabricksVolumesUploader(Uploader, ABC):
             raise self.connection_config.wrap_error(e=e)

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path =
-        self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
-        )
+        output_path = self.get_output_path(file_data=file_data)
         with open(path, "rb") as elements_file:
             try:
                 self.connection_config.get_client().files.upload(
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -0,0 +1,106 @@
+import json
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Generator, Optional
+
+from pydantic import Field
+
+from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
+from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
+    DatabrickDeltaTablesConnectionConfig,
+    DatabrickDeltaTablesUploadStager,
+    DatabrickDeltaTablesUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volume_delta_tables"
+
+
+class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    database: str = Field(description="Database name", default="default")
+    table_name: str = Field(description="Table name")
+
+
+@dataclass
+class DatabricksVolumeDeltaTableStager(DatabrickDeltaTablesUploadStager):
+    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+        # To avoid new line issues when migrating from volumes into delta tables, omit indenting
+        # and always write it as a json file
+        with output_path.with_suffix(".json").open("w") as f:
+            json.dump(data, f)
+
+
+@dataclass
+class DatabricksVolumeDeltaTableUploader(Uploader):
+    connection_config: DatabrickDeltaTablesConnectionConfig
+    upload_config: DatabricksVolumeDeltaTableUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute("SHOW CATALOGS")
+            catalogs = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.catalog not in catalogs:
+                raise ValueError(
+                    "Catalog {} not found in {}".format(
+                        self.upload_config.catalog, ", ".join(catalogs)
+                    )
+                )
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            cursor.execute("SHOW DATABASES")
+            databases = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.database not in databases:
+                raise ValueError(
+                    "Database {} not found in {}".format(
+                        self.upload_config.database, ", ".join(databases)
+                    )
+                )
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if self.upload_config.table_name not in table_names:
+                raise ValueError(
+                    "Table {} not found in {}".format(
+                        self.upload_config.table_name, ", ".join(table_names)
+                    )
+                )
+
+    def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
+        filename = Path(file_data.source_identifiers.filename)
+        adjusted_filename = filename if filename.suffix == suffix else f"{filename}{suffix}"
+        return os.path.join(self.upload_config.path, f"{adjusted_filename}")
+
+    @contextmanager
+    def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor(**connect_kwargs) as cursor:
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            yield cursor
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with self.get_cursor(staging_allowed_local_path=str(path.parent)) as cursor:
+            catalog_path = self.get_output_path(file_data=file_data)
+            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+            logger.debug(
+                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
+            )
+            with path.open() as f:
+                data = json.load(f)
+            columns = data[0].keys()
+            column_str = ", ".join(columns)
+            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+            cursor.execute(sql_statment)
+
+
+databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabrickDeltaTablesConnectionConfig,
+    uploader=DatabricksVolumeDeltaTableUploader,
+    uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
+    upload_stager=DatabricksVolumeDeltaTableStager,
+    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+)
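A hypothetical configuration sketch for this new destination, not taken from the diff: the `path` and `catalog` fields are assumed to come from `DatabricksPathMixin`, which is not shown here, and all values are placeholders.

```python
from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
    DatabricksVolumeDeltaTableUploader,
    DatabricksVolumeDeltaTableUploaderConfig,
)
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    DatabrickDeltaTablesAccessConfig,
    DatabrickDeltaTablesConnectionConfig,
)

uploader = DatabricksVolumeDeltaTableUploader(
    connection_config=DatabrickDeltaTablesConnectionConfig(
        access_config=DatabrickDeltaTablesAccessConfig(token="<databricks-pat>"),
        server_hostname="<workspace-host>",
        http_path="<sql-warehouse-http-path>",
    ),
    upload_config=DatabricksVolumeDeltaTableUploaderConfig(
        path="<volume-path>",  # assumed field from DatabricksPathMixin
        catalog="<catalog-name>",  # assumed field from DatabricksPathMixin
        database="default",
        table_name="elements",
    ),
)
# precheck() verifies the catalog, database, and table exist; run() PUTs the staged
# JSON file into the volume and INSERT ... SELECTs it into the Delta table.
uploader.precheck()
```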
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -5,12 +5,10 @@ from typing import TYPE_CHECKING, Any, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import
-    flatten_dict,
-    generator_batching_wbytes,
-)
+from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.errors import UserError
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -63,6 +61,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         pc = self.get_client()

         index = pc.Index(name=self.index_name, **index_kwargs)
+
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index

@@ -182,14 +181,18 @@ class PineconeUploader(Uploader):
         delete_kwargs = {
             "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
         }
+
         if namespace := self.upload_config.namespace:
             delete_kwargs["namespace"] = namespace
+        try:
+            index.delete(**delete_kwargs)
+        except UserError as e:
+            logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")

-        resp = index.delete(**delete_kwargs)
         logger.debug(
             f"deleted any content with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
-            f"from pinecone index: {
+            f"from pinecone index: {delete_kwargs}"
         )

     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
@@ -203,15 +206,19 @@ class PineconeUploader(Uploader):
         deleted_ids = 0
         if namespace := self.upload_config.namespace:
             list_kwargs["namespace"] = namespace
+
         for ids in index.list(**list_kwargs):
             deleted_ids += len(ids)
             delete_kwargs = {"ids": ids}
+
             if namespace := self.upload_config.namespace:
-
-
-
-
-
+                delete_kwargs["namespace"] = namespace
+
+            try:
+                index.delete(**delete_kwargs)
+            except UserError as e:
+                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
+
         logger.info(
             f"deleted {deleted_ids} records with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
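A small usage sketch of the namespace support, mirroring how the new integration test drives the uploader; it assumes these classes are importable from `unstructured_ingest.v2.processes.connectors.pinecone`, and the index name and API key are placeholders.

```python
from unstructured_ingest.v2.processes.connectors.pinecone import (
    PineconeAccessConfig,
    PineconeConnectionConfig,
    PineconeUploader,
    PineconeUploaderConfig,
)

upload_config = PineconeUploaderConfig()
# uploads and delete-by-record-id calls are scoped to this namespace
upload_config.namespace = "user-1"

uploader = PineconeUploader(
    connection_config=PineconeConnectionConfig(
        index_name="<index-name>",
        access_config=PineconeAccessConfig(api_key="<pinecone-api-key>"),
    ),
    upload_config=upload_config,
)
uploader.precheck()
```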
unstructured_ingest/v2/processes/connectors/sql/__init__.py

@@ -5,6 +5,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )

+from .databricks_delta_tables import CONNECTOR_TYPE as DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE
+from .databricks_delta_tables import databricks_delta_tables_destination_entry
 from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
 from .postgres import postgres_destination_entry, postgres_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
@@ -25,3 +27,7 @@ add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake
 add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
+add_destination_entry(
+    destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
+    entry=databricks_delta_tables_destination_entry,
+)
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py

@@ -0,0 +1,213 @@
+import json
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.data_prep import split_dataframe
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+
+if TYPE_CHECKING:
+    from databricks.sdk.core import oauth_service_principal
+    from databricks.sql.client import Connection as DeltaTableConnection
+    from databricks.sql.client import Cursor as DeltaTableCursor
+
+CONNECTOR_TYPE = "databricks_delta_tables"
+
+
+class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
+
+
+class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabrickDeltaTablesAccessConfig]
+    server_hostname: str = Field(description="server hostname connection config value")
+    http_path: str = Field(description="http path connection config value")
+    user_agent: str = "unstructuredio_oss"
+
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_credentials_provider(self) -> "oauth_service_principal":
+        from databricks.sdk.core import Config, oauth_service_principal
+
+        host = f"https://{self.server_hostname}"
+        access_configs = self.access_config.get_secret_value()
+        if (client_id := access_configs.client_id) and (
+            client_secret := access_configs.client_secret
+        ):
+            return oauth_service_principal(
+                Config(
+                    host=host,
+                    client_id=client_id,
+                    client_secret=client_secret,
+                )
+            )
+        return False
+
+    def model_post_init(self, __context: Any) -> None:
+        access_config = self.access_config.get_secret_value()
+        if access_config.token and access_config.client_secret and access_config.client_id:
+            raise ValueError(
+                "One one for of auth can be provided, either token or client id and secret"
+            )
+        if not access_config.token and not (
+            access_config.client_secret and access_config.client_id
+        ):
+            raise ValueError(
+                "One form of auth must be provided, either token or client id and secret"
+            )
+
+    @contextmanager
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_connection(self, **connect_kwargs) -> Generator["DeltaTableConnection", None, None]:
+        from databricks.sql import connect
+
+        connect_kwargs = connect_kwargs or {}
+        connect_kwargs["_user_agent_entry"] = self.user_agent
+        connect_kwargs["server_hostname"] = connect_kwargs.get(
+            "server_hostname", self.server_hostname
+        )
+        connect_kwargs["http_path"] = connect_kwargs.get("http_path", self.http_path)
+
+        if credential_provider := self.get_credentials_provider():
+            connect_kwargs["credentials_provider"] = credential_provider
+        else:
+            connect_kwargs["access_token"] = self.access_config.get_secret_value().token
+        with connect(**connect_kwargs) as connection:
+            yield connection
+
+    @contextmanager
+    def get_cursor(self, **connect_kwargs) -> Generator["DeltaTableCursor", None, None]:
+        with self.get_connection(**connect_kwargs) as connection:
+            cursor = connection.cursor()
+            yield cursor
+
+
+class DatabrickDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class DatabrickDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabrickDeltaTablesUploadStagerConfig
+
+
+class DatabrickDeltaTablesUploaderConfig(SQLUploaderConfig):
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    database: str = Field(description="Database name", default="default")
+    table_name: str = Field(description="Table name")
+
+
+@dataclass
+class DatabrickDeltaTablesUploader(SQLUploader):
+    upload_config: DatabrickDeltaTablesUploaderConfig
+    connection_config: DatabrickDeltaTablesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            yield cursor
+
+    def precheck(self) -> None:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute("SHOW CATALOGS")
+            catalogs = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.catalog not in catalogs:
+                raise ValueError(
+                    "Catalog {} not found in {}".format(
+                        self.upload_config.catalog, ", ".join(catalogs)
+                    )
+                )
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            cursor.execute("SHOW DATABASES")
+            databases = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.database not in databases:
+                raise ValueError(
+                    "Database {} not found in {}".format(
+                        self.upload_config.database, ", ".join(databases)
+                    )
+                )
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if self.upload_config.table_name not in table_names:
+                raise ValueError(
+                    "Table {} not found in {}".format(
+                        self.upload_config.table_name, ", ".join(table_names)
+                    )
+                )
+
+    def create_statement(self, columns: list[str], values: tuple[Any, ...]) -> str:
+        values_list = []
+        for v in values:
+            if isinstance(v, dict):
+                values_list.append(json.dumps(v))
+            elif isinstance(v, list):
+                if v and isinstance(v[0], (int, float)):
+                    values_list.append("ARRAY({})".format(", ".join([str(val) for val in v])))
+                else:
+                    values_list.append("ARRAY({})".format(", ".join([f"'{val}'" for val in v])))
+            else:
+                values_list.append(f"'{v}'")
+        statement = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=", ".join(columns),
+            values=", ".join(values_list),
+        )
+        return statement
+
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        if self.can_delete():
+            self.delete_by_record_id(file_data=file_data)
+        else:
+            logger.warning(
+                f"table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+        df.replace({np.nan: None}, inplace=True)
+        self._fit_to_schema(df=df)
+
+        columns = list(df.columns)
+        logger.info(
+            f"writing a total of {len(df)} elements via"
+            f" document batches to destination"
+            f" table named {self.upload_config.table_name}"
+            # f" with batch size {self.upload_config.batch_size}"
+        )
+        # TODO: currently variable binding not supporting for list types,
+        # update once that gets resolved in SDK
+        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
+            with self.get_cursor() as cursor:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                for v in values:
+                    stmt = self.create_statement(columns=columns, values=v)
+                    cursor.execute(stmt)
+
+
+databricks_delta_tables_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabrickDeltaTablesConnectionConfig,
+    uploader=DatabrickDeltaTablesUploader,
+    uploader_config=DatabrickDeltaTablesUploaderConfig,
+    upload_stager=DatabrickDeltaTablesUploadStager,
+    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+)
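A minimal sketch of the OAuth service-principal path, which the integration test above does not exercise (it uses a personal access token); hostnames and paths are placeholders. `model_post_init` rejects configs that supply both a token and client credentials, or neither.

```python
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    DatabrickDeltaTablesAccessConfig,
    DatabrickDeltaTablesConnectionConfig,
)

connection_config = DatabrickDeltaTablesConnectionConfig(
    access_config=DatabrickDeltaTablesAccessConfig(
        client_id="<oauth-client-id>",
        client_secret="<oauth-client-secret>",
    ),
    server_hostname="<workspace-host>",
    http_path="<sql-warehouse-http-path>",
)

# get_cursor() opens a databricks-sql connection, preferring the
# oauth_service_principal credentials provider when client credentials are set.
with connection_config.get_cursor() as cursor:
    cursor.execute("SHOW CATALOGS")
    print(cursor.fetchall())
```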
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -129,8 +129,13 @@ class SQLIndexer(Indexer, ABC):
     connection_config: SQLConnectionConfig
     index_config: SQLIndexerConfig

-
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
+    def _get_doc_ids(self) -> list[str]:
+        with self.get_cursor() as cursor:
             cursor.execute(
                 f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
             )
@@ -140,7 +145,7 @@ class SQLIndexer(Indexer, ABC):

     def precheck(self) -> None:
         try:
-            with self.
+            with self.get_cursor() as cursor:
                 cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -182,6 +187,11 @@ class SQLDownloader(Downloader, ABC):
     connection_config: SQLConnectionConfig
     download_config: SQLDownloaderConfig

+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
     @abstractmethod
     def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         pass
@@ -323,12 +333,17 @@ class SQLUploader(Uploader):

     def precheck(self) -> None:
         try:
-            with self.
+            with self.get_cursor() as cursor:
                 cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            yield cursor
+
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
@@ -346,7 +361,7 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def _fit_to_schema(self, df: pd.DataFrame
+    def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
         columns = set(df.columns)
         schema_fields = set(columns)
         columns_to_drop = columns - schema_fields
@@ -367,6 +382,7 @@ class SQLUploader(Uploader):

         for column in missing_columns:
             df[column] = pd.Series()
+        return df

     def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         if self.can_delete():
@@ -378,7 +394,7 @@ class SQLUploader(Uploader):
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df
+        self._fit_to_schema(df=df)

         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
@@ -393,7 +409,7 @@ class SQLUploader(Uploader):
             f" with batch size {self.upload_config.batch_size}"
         )
         for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
-            with self.
+            with self.get_cursor() as cursor:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                 # For debugging purposes:
                 # for val in values:
@@ -406,7 +422,7 @@ class SQLUploader(Uploader):
                 cursor.executemany(stmt, values)

     def get_table_columns(self) -> list[str]:
-        with self.
+        with self.get_cursor() as cursor:
             cursor.execute(f"SELECT * from {self.upload_config.table_name}")
             return [desc[0] for desc in cursor.description]

@@ -420,10 +436,11 @@ class SQLUploader(Uploader):
             f"from table {self.upload_config.table_name}"
         )
         stmt = f"DELETE FROM {self.upload_config.table_name} WHERE {self.upload_config.record_id_key} = {self.values_delimiter}"  # noqa: E501
-        with self.
+        with self.get_cursor() as cursor:
             cursor.execute(stmt, [file_data.identifier])
             rowcount = cursor.rowcount
-
+            if rowcount > 0:
+                logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")

     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         df = pd.DataFrame(data)
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.
+Version: 0.3.15
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,14 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist:
+Requires-Dist: ndjson
+Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
-Requires-Dist: tqdm
 Requires-Dist: dataclasses-json
-Requires-Dist:
+Requires-Dist: tqdm
 Requires-Dist: click
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: python-dateutil
+Requires-Dist: opentelemetry-sdk
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -40,11 +40,11 @@ Requires-Dist: fsspec; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
-Requires-Dist: boto3; extra == "bedrock"
 Requires-Dist: aioboto3; extra == "bedrock"
+Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
@@ -59,6 +59,8 @@ Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
 Requires-Dist: unstructured[tsv]; extra == "csv"
+Provides-Extra: databricks-delta-tables
+Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
@@ -71,8 +73,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
@@ -82,8 +84,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -92,8 +94,8 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -122,22 +124,22 @@ Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: neo4j
 Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: neo4j; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: httpx; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -168,8 +170,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
@@ -183,16 +185,16 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
 Provides-Extra: vectara
-Requires-Dist: requests; extra == "vectara"
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
+Requires-Dist: requests; extra == "vectara"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"
 Provides-Extra: wikipedia
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/RECORD

@@ -16,7 +16,7 @@ test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8u
 test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
 test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
 test/integration/connectors/test_onedrive.py,sha256=TcMaa5BIp8J6engS4UZ2t19WQP0NNz2rkpBB47m7A3Y,3835
-test/integration/connectors/test_pinecone.py,sha256=
+test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
 test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
 test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
@@ -34,6 +34,7 @@ test/integration/connectors/elasticsearch/conftest.py,sha256=-i4_7MkIxSQENz7nuD2
 test/integration/connectors/elasticsearch/test_elasticsearch.py,sha256=TsSEPsyaTUoEvFBadinrdM0b5C4FoUtEwCv24OUbpO8,12072
 test/integration/connectors/elasticsearch/test_opensearch.py,sha256=7b7z0GqoBsBqA3IK35N6axmwEMjzJ1l3Fg2WT2c7uqs,11450
 test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+test/integration/connectors/sql/test_databricks_delta_tables.py,sha256=UjVjw5hVoMSNJoYdoYympYow25gvcDAEHLmUmOJKz7I,5036
 test/integration/connectors/sql/test_postgres.py,sha256=bGDyzLRpgrXO7nl0U8nF2zSNr6ykUG-w8T4daIqUCG4,6970
 test/integration/connectors/sql/test_singlestore.py,sha256=XeU2s4Kt_3tGyaDYYKTgYjdOyb8j2dnz4TgSMwFUjWs,6153
 test/integration/connectors/sql/test_snowflake.py,sha256=LEwsRDoC6-rRiwYsqeo5B9Eo6RYygLLGAUsrtrgI9pM,7494
@@ -96,7 +97,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=31lJzr6gfqqAcVEa6C2kjStzBSJPXWUyP7eRpa8Y7gI,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -386,7 +387,7 @@ unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJi
 unstructured_ingest/v2/interfaces/indexer.py,sha256=gsa1MLhFa82BzD2h4Yb7ons0VxRwKINZOrzvHAahwVU,846
 unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
-unstructured_ingest/v2/interfaces/upload_stager.py,sha256=
+unstructured_ingest/v2/interfaces/upload_stager.py,sha256=nbMuo_U6Gqn9bDJrAJTCjrZXKMw_G28OZOuNsT23i0k,3608
 unstructured_ingest/v2/interfaces/uploader.py,sha256=T2oHbN-d4Px1w1oATKKYZA10aUssqytEpiaqBM92r0Q,1600
 unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
@@ -427,19 +428,20 @@ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNN
 unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=d6gC40YmfqBNXxizAt4MO4OOu5BoCZ7SAe1AbNwTP0E,18322
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
 unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
 unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
-unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=
-unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
+unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
 unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=cb-EUW0T-linZMkbU6AcKEGWnFHQvhpO5Abtps4P2X0,3532
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=tR8NubkyHw49IpW_42g6w1Koxlm56EPiPf1lB-eoRSI,2783
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=dJLD1fueXf8_0AfC4cg0G7siJZVefz68iuEx2Kq7rMs,2890
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=muj7G2JFO_WwAPub14k0VqDmN3c56t9MA60rM48wal8,4750
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=XTV9Pox3_xVmI8YVQWC9Bn6PugbPM49kp4Scv1OXFys,2649
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=oUHHaLpO2pWW2Lu4Mc-XFjrA0ze97205WQ_xP95ua4M,4296
@@ -538,20 +540,21 @@ unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-
 unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ4DU8yhJWMpL82QYwBVdPTxxNuV127U,1588
 unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py,sha256=BHI7HYSdbS05j2vrjyDvLzVG1WfsM8osKeq-lttlybQ,5437
 unstructured_ingest/v2/processes/connectors/qdrant/server.py,sha256=odvCZWZp8DmRxLXMR7tHhW-c7UQbix1_zpFdfXfCvKI,1613
-unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=mxcrncrjeP-C2jqQoTOOpGjV3Bmyfg4efT5lq_c-V1E,1760
+unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=s_W6wSvyIXZ9mdAxvgSXFeFSze9E7pwIvc38p1hVDLM,8839
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=-2E9dsdNhjAiuzeSBytBbAhljOhvQ8kN8wvlUESvLo8,5465
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=8qCm1XiJmVxy8TSeoxwmQrE2W1x8S8At2ctrS_lJ8-I,7780
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=ZGpeBfiOEzVaSiQxwqJkMC00Eu6TQhsrZKHnOHM0Xug,15667
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=Q5RAqn5Ccw-pbeKZLkiMn5IVw6EemCMukXzLlS7pDhc,5162
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
 unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=X1yv1H_orDQ-J965EMXhR2XaURqe8vovSi9n1fk85B4,10499
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
+unstructured_ingest-0.3.15.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.3.15.dist-info/METADATA,sha256=rZFAbiv0HZ-VUWVk4MP2vANZuzsxJLhK2_QWZ5zTjRA,7929
+unstructured_ingest-0.3.15.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.3.15.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.3.15.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.3.15.dist-info/RECORD,,
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/LICENSE.md RENAMED (file without changes)
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/WHEEL RENAMED (file without changes)
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/entry_points.txt RENAMED (file without changes)
{unstructured_ingest-0.3.14.dist-info → unstructured_ingest-0.3.15.dist-info}/top_level.txt RENAMED (file without changes)