unstructured-ingest 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

@@ -0,0 +1,142 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from contextlib import contextmanager
5
+ from pathlib import Path
6
+ from uuid import uuid4
7
+
8
+ import pytest
9
+ from databricks.sql import connect
10
+ from databricks.sql.client import Connection as DeltaTableConnection
11
+ from databricks.sql.client import Cursor as DeltaTableCursor
12
+ from pydantic import BaseModel, SecretStr
13
+
14
+ from test.integration.connectors.utils.constants import DESTINATION_TAG, SQL_TAG, env_setup_path
15
+ from test.integration.utils import requires_env
16
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
17
+ from unstructured_ingest.v2.logger import logger
18
+ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
19
+ CONNECTOR_TYPE,
20
+ DatabrickDeltaTablesAccessConfig,
21
+ DatabrickDeltaTablesConnectionConfig,
22
+ DatabrickDeltaTablesUploader,
23
+ DatabrickDeltaTablesUploaderConfig,
24
+ DatabrickDeltaTablesUploadStager,
25
+ )
26
+
27
+ CATALOG = "utic-dev-tech-fixtures"
28
+
29
+
30
+ class EnvData(BaseModel):
31
+ server_hostname: str
32
+ http_path: str
33
+ access_token: SecretStr
34
+
35
+
36
+ def get_env_data() -> EnvData:
37
+ return EnvData(
38
+ server_hostname=os.environ["DATABRICKS_SERVER_HOSTNAME"],
39
+ http_path=os.environ["DATABRICKS_HTTP_PATH"],
40
+ access_token=os.environ["DATABRICKS_ACCESS_TOKEN"],
41
+ )
42
+
43
+
44
+ def get_destination_schema(new_table_name: str) -> str:
45
+ p = Path(env_setup_path / "sql" / "databricks_delta_tables" / "destination" / "schema.sql")
46
+ with p.open() as f:
47
+ data_lines = f.readlines()
48
+ data_lines[0] = data_lines[0].replace("elements", new_table_name)
49
+ data = "".join([line.strip() for line in data_lines])
50
+ return data
51
+
52
+
53
+ @contextmanager
54
+ def get_connection() -> DeltaTableConnection:
55
+ env_data = get_env_data()
56
+ with connect(
57
+ server_hostname=env_data.server_hostname,
58
+ http_path=env_data.http_path,
59
+ access_token=env_data.access_token.get_secret_value(),
60
+ ) as connection:
61
+ yield connection
62
+
63
+
64
+ @contextmanager
65
+ def get_cursor() -> DeltaTableCursor:
66
+ with get_connection() as connection:
67
+ with connection.cursor() as cursor:
68
+ cursor.execute(f"USE CATALOG '{CATALOG}'")
69
+ yield cursor
70
+
71
+
72
+ @pytest.fixture
73
+ def destination_table() -> str:
74
+ random_id = str(uuid4())[:8]
75
+ table_name = f"elements_{random_id}"
76
+ destination_schema = get_destination_schema(new_table_name=table_name)
77
+ with get_cursor() as cursor:
78
+ logger.info(f"creating table: {table_name}")
79
+ cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
80
+ cursor.execute(destination_schema)
81
+
82
+ yield table_name
83
+ with get_cursor() as cursor:
84
+ logger.info(f"dropping table: {table_name}")
85
+ cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
86
+
87
+
88
+ def validate_destination(expected_num_elements: int, table_name: str, retries=30, interval=1):
89
+ with get_cursor() as cursor:
90
+ for i in range(retries):
91
+ cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
92
+ count = cursor.fetchone()[0]
93
+ if count == expected_num_elements:
94
+ break
95
+ logger.info(f"retry attempt {i}: expected {expected_num_elements} != count {count}")
96
+ time.sleep(interval)
97
+ assert (
98
+ count == expected_num_elements
99
+ ), f"dest check failed: got {count}, expected {expected_num_elements}"
100
+
101
+
102
+ @pytest.mark.asyncio
103
+ @pytest.mark.skip("Resources take too long to spin up to run in CI")
104
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, SQL_TAG)
105
+ @requires_env("DATABRICKS_SERVER_HOSTNAME", "DATABRICKS_HTTP_PATH", "DATABRICKS_ACCESS_TOKEN")
106
+ async def test_databricks_delta_tables_destination(
107
+ upload_file: Path, temp_dir: Path, destination_table: str
108
+ ):
109
+ env_data = get_env_data()
110
+ mock_file_data = FileData(
111
+ identifier="mock file data",
112
+ connector_type=CONNECTOR_TYPE,
113
+ source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
114
+ )
115
+ stager = DatabrickDeltaTablesUploadStager()
116
+ staged_path = stager.run(
117
+ elements_filepath=upload_file,
118
+ file_data=mock_file_data,
119
+ output_dir=temp_dir,
120
+ output_filename=upload_file.name,
121
+ )
122
+
123
+ assert staged_path.suffix == upload_file.suffix
124
+
125
+ uploader = DatabrickDeltaTablesUploader(
126
+ connection_config=DatabrickDeltaTablesConnectionConfig(
127
+ access_config=DatabrickDeltaTablesAccessConfig(
128
+ token=env_data.access_token.get_secret_value()
129
+ ),
130
+ http_path=env_data.http_path,
131
+ server_hostname=env_data.server_hostname,
132
+ ),
133
+ upload_config=DatabrickDeltaTablesUploaderConfig(
134
+ catalog=CATALOG, database="default", table_name=destination_table
135
+ ),
136
+ )
137
+ with staged_path.open("r") as f:
138
+ staged_data = json.load(f)
139
+ expected_num_elements = len(staged_data)
140
+ uploader.precheck()
141
+ uploader.run(path=staged_path, file_data=mock_file_data)
142
+ validate_destination(expected_num_elements=expected_num_elements, table_name=destination_table)
@@ -107,11 +107,15 @@ def pinecone_index() -> Generator[str, None, None]:
107
107
 
108
108
 
109
109
  def validate_pinecone_index(
110
- index_name: str, expected_num_of_vectors: int, retries=30, interval=1
110
+ index_name: str,
111
+ expected_num_of_vectors: int,
112
+ retries=30,
113
+ interval=1,
114
+ namespace: str = "default",
111
115
  ) -> None:
112
116
  # Because there's a delay for the index to catch up to the recent writes, add in a retry
113
117
  pinecone = Pinecone(api_key=get_api_key())
114
- index = pinecone.Index(name=index_name)
118
+ index = pinecone.Index(name=index_name, namespace=namespace)
115
119
  vector_count = -1
116
120
  for i in range(retries):
117
121
  index_stats = index.describe_index_stats()
@@ -133,11 +137,13 @@ def validate_pinecone_index(
133
137
  @pytest.mark.asyncio
134
138
  @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
135
139
  async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
140
+
136
141
  file_data = FileData(
137
142
  source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
138
143
  connector_type=CONNECTOR_TYPE,
139
144
  identifier="pinecone_mock_id",
140
145
  )
146
+
141
147
  connection_config = PineconeConnectionConfig(
142
148
  index_name=pinecone_index,
143
149
  access_config=PineconeAccessConfig(api_key=get_api_key()),
@@ -224,6 +230,66 @@ async def test_pinecone_destination_large_index(
224
230
  )
225
231
 
226
232
 
233
+ @requires_env(API_KEY)
234
+ @pytest.mark.asyncio
235
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
236
+ async def test_pinecone_destination_namespace(
237
+ pinecone_index: str, upload_file: Path, temp_dir: Path
238
+ ):
239
+ """
240
+ tests namespace functionality of destination connector.
241
+ """
242
+
243
+ # creates a file data structure.
244
+ file_data = FileData(
245
+ source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
246
+ connector_type=CONNECTOR_TYPE,
247
+ identifier="pinecone_mock_id",
248
+ )
249
+
250
+ connection_config = PineconeConnectionConfig(
251
+ index_name=pinecone_index,
252
+ access_config=PineconeAccessConfig(api_key=get_api_key()),
253
+ )
254
+
255
+ stager_config = PineconeUploadStagerConfig()
256
+
257
+ stager = PineconeUploadStager(upload_stager_config=stager_config)
258
+ new_upload_file = stager.run(
259
+ elements_filepath=upload_file,
260
+ output_dir=temp_dir,
261
+ output_filename=upload_file.name,
262
+ file_data=file_data,
263
+ )
264
+
265
+ # here add namespace defintion
266
+ upload_config = PineconeUploaderConfig()
267
+ namespace_test_name = "user-1"
268
+ upload_config.namespace = namespace_test_name
269
+ uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
270
+ uploader.precheck()
271
+
272
+ uploader.run(path=new_upload_file, file_data=file_data)
273
+ with new_upload_file.open() as f:
274
+ staged_content = json.load(f)
275
+ expected_num_of_vectors = len(staged_content)
276
+ logger.info("validating first upload")
277
+ validate_pinecone_index(
278
+ index_name=pinecone_index,
279
+ expected_num_of_vectors=expected_num_of_vectors,
280
+ namespace=namespace_test_name,
281
+ )
282
+
283
+ # Rerun uploader and make sure no duplicates exist
284
+ uploader.run(path=new_upload_file, file_data=file_data)
285
+ logger.info("validating second upload")
286
+ validate_pinecone_index(
287
+ index_name=pinecone_index,
288
+ expected_num_of_vectors=expected_num_of_vectors,
289
+ namespace=namespace_test_name,
290
+ )
291
+
292
+
227
293
  @requires_env(API_KEY)
228
294
  @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
229
295
  def test_large_metadata(pinecone_index: str, tmp_path: Path, upload_file: Path):
@@ -1 +1 @@
1
- __version__ = "0.3.14" # pragma: no cover
1
+ __version__ = "0.3.15" # pragma: no cover
@@ -2,7 +2,7 @@ import json
2
2
  from abc import ABC
3
3
  from dataclasses import dataclass
4
4
  from pathlib import Path
5
- from typing import Any, TypeVar
5
+ from typing import Any, Optional, TypeVar
6
6
 
7
7
  import ndjson
8
8
  from pydantic import BaseModel
@@ -22,10 +22,10 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
22
22
  class UploadStager(BaseProcess, ABC):
23
23
  upload_stager_config: UploadStagerConfigT
24
24
 
25
- def write_output(self, output_path: Path, data: list[dict]) -> None:
25
+ def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
26
26
  if output_path.suffix == ".json":
27
27
  with output_path.open("w") as f:
28
- json.dump(data, f, indent=2)
28
+ json.dump(data, f, indent=indent)
29
29
  elif output_path.suffix == ".ndjson":
30
30
  with output_path.open("w") as f:
31
31
  ndjson.dump(data, f)
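
For illustration only: the new optional indent parameter lets a stager subclass switch to compact, single-line JSON without re-implementing the file handling. A minimal sketch, assuming UploadStager is importable from the upload_stager module touched above (CompactJsonStager is a hypothetical name):

from pathlib import Path
from typing import Optional

from unstructured_ingest.v2.interfaces.upload_stager import UploadStager


class CompactJsonStager(UploadStager):
    # Hypothetical subclass: default indent to None so json.dump writes a single line.
    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = None) -> None:
        super().write_output(output_path=output_path, data=data, indent=indent)
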
@@ -25,6 +25,8 @@ from .volumes_native import (
25
25
  databricks_native_volumes_destination_entry,
26
26
  databricks_native_volumes_source_entry,
27
27
  )
28
+ from .volumes_table import CONNECTOR_TYPE as VOLUMES_TABLE_CONNECTOR_TYPE
29
+ from .volumes_table import databricks_volumes_delta_tables_destination_entry
28
30
 
29
31
  add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
30
32
  add_destination_entry(
@@ -50,3 +52,7 @@ add_source_entry(
50
52
  add_destination_entry(
51
53
  destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
52
54
  )
55
+ add_destination_entry(
56
+ destination_type=VOLUMES_TABLE_CONNECTOR_TYPE,
57
+ entry=databricks_volumes_delta_tables_destination_entry,
58
+ )
@@ -187,6 +187,11 @@ class DatabricksVolumesUploader(Uploader, ABC):
187
187
  upload_config: DatabricksVolumesUploaderConfig
188
188
  connection_config: DatabricksVolumesConnectionConfig
189
189
 
190
+ def get_output_path(self, file_data: FileData) -> str:
191
+ return os.path.join(
192
+ self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
193
+ )
194
+
190
195
  def precheck(self) -> None:
191
196
  try:
192
197
  assert self.connection_config.get_client().current_user.me().active
@@ -194,9 +199,7 @@ class DatabricksVolumesUploader(Uploader, ABC):
194
199
  raise self.connection_config.wrap_error(e=e)
195
200
 
196
201
  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
197
- output_path = os.path.join(
198
- self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
199
- )
202
+ output_path = self.get_output_path(file_data=file_data)
200
203
  with open(path, "rb") as elements_file:
201
204
  try:
202
205
  self.connection_config.get_client().files.upload(
@@ -0,0 +1,106 @@
1
+ import json
2
+ import os
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Any, Generator, Optional
7
+
8
+ from pydantic import Field
9
+
10
+ from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
11
+ from unstructured_ingest.v2.logger import logger
12
+ from unstructured_ingest.v2.processes.connector_registry import (
13
+ DestinationRegistryEntry,
14
+ )
15
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
16
+ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
17
+ DatabrickDeltaTablesConnectionConfig,
18
+ DatabrickDeltaTablesUploadStager,
19
+ DatabrickDeltaTablesUploadStagerConfig,
20
+ )
21
+
22
+ CONNECTOR_TYPE = "databricks_volume_delta_tables"
23
+
24
+
25
+ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
26
+ database: str = Field(description="Database name", default="default")
27
+ table_name: str = Field(description="Table name")
28
+
29
+
30
+ @dataclass
31
+ class DatabricksVolumeDeltaTableStager(DatabrickDeltaTablesUploadStager):
32
+ def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
33
+ # To avoid new line issues when migrating from volumes into delta tables, omit indenting
34
+ # and always write it as a json file
35
+ with output_path.with_suffix(".json").open("w") as f:
36
+ json.dump(data, f)
37
+
38
+
39
+ @dataclass
40
+ class DatabricksVolumeDeltaTableUploader(Uploader):
41
+ connection_config: DatabrickDeltaTablesConnectionConfig
42
+ upload_config: DatabricksVolumeDeltaTableUploaderConfig
43
+ connector_type: str = CONNECTOR_TYPE
44
+
45
+ def precheck(self) -> None:
46
+ with self.connection_config.get_cursor() as cursor:
47
+ cursor.execute("SHOW CATALOGS")
48
+ catalogs = [r[0] for r in cursor.fetchall()]
49
+ if self.upload_config.catalog not in catalogs:
50
+ raise ValueError(
51
+ "Catalog {} not found in {}".format(
52
+ self.upload_config.catalog, ", ".join(catalogs)
53
+ )
54
+ )
55
+ cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
56
+ cursor.execute("SHOW DATABASES")
57
+ databases = [r[0] for r in cursor.fetchall()]
58
+ if self.upload_config.database not in databases:
59
+ raise ValueError(
60
+ "Database {} not found in {}".format(
61
+ self.upload_config.database, ", ".join(databases)
62
+ )
63
+ )
64
+ cursor.execute("SHOW TABLES")
65
+ table_names = [r[1] for r in cursor.fetchall()]
66
+ if self.upload_config.table_name not in table_names:
67
+ raise ValueError(
68
+ "Table {} not found in {}".format(
69
+ self.upload_config.table_name, ", ".join(table_names)
70
+ )
71
+ )
72
+
73
+ def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
74
+ filename = Path(file_data.source_identifiers.filename)
75
+ adjusted_filename = filename if filename.suffix == suffix else f"{filename}{suffix}"
76
+ return os.path.join(self.upload_config.path, f"{adjusted_filename}")
77
+
78
+ @contextmanager
79
+ def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
80
+ with self.connection_config.get_cursor(**connect_kwargs) as cursor:
81
+ cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
82
+ yield cursor
83
+
84
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
85
+ with self.get_cursor(staging_allowed_local_path=str(path.parent)) as cursor:
86
+ catalog_path = self.get_output_path(file_data=file_data)
87
+ logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
88
+ cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
89
+ logger.debug(
90
+ f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
91
+ )
92
+ with path.open() as f:
93
+ data = json.load(f)
94
+ columns = data[0].keys()
95
+ column_str = ", ".join(columns)
96
+ sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`" # noqa: E501
97
+ cursor.execute(sql_statement)
98
+
99
+
100
+ databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
101
+ connection_config=DatabrickDeltaTablesConnectionConfig,
102
+ uploader=DatabricksVolumeDeltaTableUploader,
103
+ uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
104
+ upload_stager=DatabricksVolumeDeltaTableStager,
105
+ upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
106
+ )
@@ -5,12 +5,10 @@ from typing import TYPE_CHECKING, Any, Optional
5
5
  from pydantic import Field, Secret
6
6
 
7
7
  from unstructured_ingest.error import DestinationConnectionError
8
- from unstructured_ingest.utils.data_prep import (
9
- flatten_dict,
10
- generator_batching_wbytes,
11
- )
8
+ from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
12
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
13
10
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
11
+ from unstructured_ingest.v2.errors import UserError
14
12
  from unstructured_ingest.v2.interfaces import (
15
13
  AccessConfig,
16
14
  ConnectionConfig,
@@ -63,6 +61,7 @@ class PineconeConnectionConfig(ConnectionConfig):
63
61
  pc = self.get_client()
64
62
 
65
63
  index = pc.Index(name=self.index_name, **index_kwargs)
64
+
66
65
  logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
67
66
  return index
68
67
 
@@ -182,14 +181,18 @@ class PineconeUploader(Uploader):
182
181
  delete_kwargs = {
183
182
  "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
184
183
  }
184
+
185
185
  if namespace := self.upload_config.namespace:
186
186
  delete_kwargs["namespace"] = namespace
187
+ try:
188
+ index.delete(**delete_kwargs)
189
+ except UserError as e:
190
+ logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
187
191
 
188
- resp = index.delete(**delete_kwargs)
189
192
  logger.debug(
190
193
  f"deleted any content with metadata "
191
194
  f"{self.upload_config.record_id_key}={file_data.identifier} "
192
- f"from pinecone index: {resp}"
195
+ f"from pinecone index: {delete_kwargs}"
193
196
  )
194
197
 
195
198
  def serverless_delete_by_record_id(self, file_data: FileData) -> None:
@@ -203,15 +206,19 @@ class PineconeUploader(Uploader):
203
206
  deleted_ids = 0
204
207
  if namespace := self.upload_config.namespace:
205
208
  list_kwargs["namespace"] = namespace
209
+
206
210
  for ids in index.list(**list_kwargs):
207
211
  deleted_ids += len(ids)
208
212
  delete_kwargs = {"ids": ids}
213
+
209
214
  if namespace := self.upload_config.namespace:
210
- delete_resp = delete_kwargs["namespace"] = namespace
211
- # delete_resp should be an empty dict if there were no errors
212
- if delete_resp:
213
- logger.error(f"failed to delete batch of ids: {delete_resp}")
214
- index.delete(**delete_kwargs)
215
+ delete_kwargs["namespace"] = namespace
216
+
217
+ try:
218
+ index.delete(**delete_kwargs)
219
+ except UserError as e:
220
+ logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
221
+
215
222
  logger.info(
216
223
  f"deleted {deleted_ids} records with metadata "
217
224
  f"{self.upload_config.record_id_key}={file_data.identifier} "
@@ -5,6 +5,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
5
5
  add_source_entry,
6
6
  )
7
7
 
8
+ from .databricks_delta_tables import CONNECTOR_TYPE as DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE
9
+ from .databricks_delta_tables import databricks_delta_tables_destination_entry
8
10
  from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
9
11
  from .postgres import postgres_destination_entry, postgres_source_entry
10
12
  from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
@@ -25,3 +27,7 @@ add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake
25
27
  add_destination_entry(
26
28
  destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
27
29
  )
30
+ add_destination_entry(
31
+ destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
32
+ entry=databricks_delta_tables_destination_entry,
33
+ )
@@ -0,0 +1,213 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.utils.data_prep import split_dataframe
11
+ from unstructured_ingest.utils.dep_check import requires_dependencies
12
+ from unstructured_ingest.v2.interfaces import FileData
13
+ from unstructured_ingest.v2.logger import logger
14
+ from unstructured_ingest.v2.processes.connector_registry import (
15
+ DestinationRegistryEntry,
16
+ )
17
+ from unstructured_ingest.v2.processes.connectors.sql.sql import (
18
+ SQLAccessConfig,
19
+ SQLConnectionConfig,
20
+ SQLUploader,
21
+ SQLUploaderConfig,
22
+ SQLUploadStager,
23
+ SQLUploadStagerConfig,
24
+ )
25
+
26
+ if TYPE_CHECKING:
27
+ from databricks.sdk.core import oauth_service_principal
28
+ from databricks.sql.client import Connection as DeltaTableConnection
29
+ from databricks.sql.client import Cursor as DeltaTableCursor
30
+
31
+ CONNECTOR_TYPE = "databricks_delta_tables"
32
+
33
+
34
+ class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
35
+ token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
36
+ client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
37
+ client_secret: Optional[str] = Field(
38
+ default=None, description="Client Secret of the OAuth app."
39
+ )
40
+
41
+
42
+ class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
43
+ access_config: Secret[DatabrickDeltaTablesAccessConfig]
44
+ server_hostname: str = Field(description="server hostname connection config value")
45
+ http_path: str = Field(description="http path connection config value")
46
+ user_agent: str = "unstructuredio_oss"
47
+
48
+ @requires_dependencies(["databricks"], extras="databricks-delta-tables")
49
+ def get_credentials_provider(self) -> "oauth_service_principal":
50
+ from databricks.sdk.core import Config, oauth_service_principal
51
+
52
+ host = f"https://{self.server_hostname}"
53
+ access_configs = self.access_config.get_secret_value()
54
+ if (client_id := access_configs.client_id) and (
55
+ client_secret := access_configs.client_secret
56
+ ):
57
+ return oauth_service_principal(
58
+ Config(
59
+ host=host,
60
+ client_id=client_id,
61
+ client_secret=client_secret,
62
+ )
63
+ )
64
+ return False
65
+
66
+ def model_post_init(self, __context: Any) -> None:
67
+ access_config = self.access_config.get_secret_value()
68
+ if access_config.token and access_config.client_secret and access_config.client_id:
69
+ raise ValueError(
70
+ "One one for of auth can be provided, either token or client id and secret"
71
+ )
72
+ if not access_config.token and not (
73
+ access_config.client_secret and access_config.client_id
74
+ ):
75
+ raise ValueError(
76
+ "One form of auth must be provided, either token or client id and secret"
77
+ )
78
+
79
+ @contextmanager
80
+ @requires_dependencies(["databricks"], extras="databricks-delta-tables")
81
+ def get_connection(self, **connect_kwargs) -> Generator["DeltaTableConnection", None, None]:
82
+ from databricks.sql import connect
83
+
84
+ connect_kwargs = connect_kwargs or {}
85
+ connect_kwargs["_user_agent_entry"] = self.user_agent
86
+ connect_kwargs["server_hostname"] = connect_kwargs.get(
87
+ "server_hostname", self.server_hostname
88
+ )
89
+ connect_kwargs["http_path"] = connect_kwargs.get("http_path", self.http_path)
90
+
91
+ if credential_provider := self.get_credentials_provider():
92
+ connect_kwargs["credentials_provider"] = credential_provider
93
+ else:
94
+ connect_kwargs["access_token"] = self.access_config.get_secret_value().token
95
+ with connect(**connect_kwargs) as connection:
96
+ yield connection
97
+
98
+ @contextmanager
99
+ def get_cursor(self, **connect_kwargs) -> Generator["DeltaTableCursor", None, None]:
100
+ with self.get_connection(**connect_kwargs) as connection:
101
+ cursor = connection.cursor()
102
+ yield cursor
103
+
104
+
105
+ class DatabrickDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
106
+ pass
107
+
108
+
109
+ class DatabrickDeltaTablesUploadStager(SQLUploadStager):
110
+ upload_stager_config: DatabrickDeltaTablesUploadStagerConfig
111
+
112
+
113
+ class DatabrickDeltaTablesUploaderConfig(SQLUploaderConfig):
114
+ catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
115
+ database: str = Field(description="Database name", default="default")
116
+ table_name: str = Field(description="Table name")
117
+
118
+
119
+ @dataclass
120
+ class DatabrickDeltaTablesUploader(SQLUploader):
121
+ upload_config: DatabrickDeltaTablesUploaderConfig
122
+ connection_config: DatabrickDeltaTablesConnectionConfig
123
+ connector_type: str = CONNECTOR_TYPE
124
+
125
+ @contextmanager
126
+ def get_cursor(self) -> Generator[Any, None, None]:
127
+ with self.connection_config.get_cursor() as cursor:
128
+ cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
129
+ yield cursor
130
+
131
+ def precheck(self) -> None:
132
+ with self.connection_config.get_cursor() as cursor:
133
+ cursor.execute("SHOW CATALOGS")
134
+ catalogs = [r[0] for r in cursor.fetchall()]
135
+ if self.upload_config.catalog not in catalogs:
136
+ raise ValueError(
137
+ "Catalog {} not found in {}".format(
138
+ self.upload_config.catalog, ", ".join(catalogs)
139
+ )
140
+ )
141
+ cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
142
+ cursor.execute("SHOW DATABASES")
143
+ databases = [r[0] for r in cursor.fetchall()]
144
+ if self.upload_config.database not in databases:
145
+ raise ValueError(
146
+ "Database {} not found in {}".format(
147
+ self.upload_config.database, ", ".join(databases)
148
+ )
149
+ )
150
+ cursor.execute("SHOW TABLES")
151
+ table_names = [r[1] for r in cursor.fetchall()]
152
+ if self.upload_config.table_name not in table_names:
153
+ raise ValueError(
154
+ "Table {} not found in {}".format(
155
+ self.upload_config.table_name, ", ".join(table_names)
156
+ )
157
+ )
158
+
159
+ def create_statement(self, columns: list[str], values: tuple[Any, ...]) -> str:
160
+ values_list = []
161
+ for v in values:
162
+ if isinstance(v, dict):
163
+ values_list.append(json.dumps(v))
164
+ elif isinstance(v, list):
165
+ if v and isinstance(v[0], (int, float)):
166
+ values_list.append("ARRAY({})".format(", ".join([str(val) for val in v])))
167
+ else:
168
+ values_list.append("ARRAY({})".format(", ".join([f"'{val}'" for val in v])))
169
+ else:
170
+ values_list.append(f"'{v}'")
171
+ statement = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
172
+ table_name=self.upload_config.table_name,
173
+ columns=", ".join(columns),
174
+ values=", ".join(values_list),
175
+ )
176
+ return statement
177
+
178
+ def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
179
+ if self.can_delete():
180
+ self.delete_by_record_id(file_data=file_data)
181
+ else:
182
+ logger.warning(
183
+ f"table doesn't contain expected "
184
+ f"record id column "
185
+ f"{self.upload_config.record_id_key}, skipping delete"
186
+ )
187
+ df.replace({np.nan: None}, inplace=True)
188
+ self._fit_to_schema(df=df)
189
+
190
+ columns = list(df.columns)
191
+ logger.info(
192
+ f"writing a total of {len(df)} elements via"
193
+ f" document batches to destination"
194
+ f" table named {self.upload_config.table_name}"
195
+ # f" with batch size {self.upload_config.batch_size}"
196
+ )
197
+ # TODO: currently variable binding not supporting for list types,
198
+ # update once that gets resolved in SDK
199
+ for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
200
+ with self.get_cursor() as cursor:
201
+ values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
202
+ for v in values:
203
+ stmt = self.create_statement(columns=columns, values=v)
204
+ cursor.execute(stmt)
205
+
206
+
207
+ databricks_delta_tables_destination_entry = DestinationRegistryEntry(
208
+ connection_config=DatabrickDeltaTablesConnectionConfig,
209
+ uploader=DatabrickDeltaTablesUploader,
210
+ uploader_config=DatabrickDeltaTablesUploaderConfig,
211
+ upload_stager=DatabrickDeltaTablesUploadStager,
212
+ upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
213
+ )
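
For orientation, the new destination's pieces can be wired together much as the integration test earlier in this diff does; a sketch with placeholder credentials and names:

from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    DatabrickDeltaTablesAccessConfig,
    DatabrickDeltaTablesConnectionConfig,
    DatabrickDeltaTablesUploader,
    DatabrickDeltaTablesUploaderConfig,
)

uploader = DatabrickDeltaTablesUploader(
    connection_config=DatabrickDeltaTablesConnectionConfig(
        # Either a personal access token or client_id/client_secret, not both.
        access_config=DatabrickDeltaTablesAccessConfig(token="DATABRICKS_PAT"),  # placeholder
        server_hostname="my-workspace.cloud.databricks.com",  # placeholder
        http_path="/sql/1.0/warehouses/abc123",  # placeholder
    ),
    upload_config=DatabrickDeltaTablesUploaderConfig(
        catalog="my_catalog", database="default", table_name="elements"
    ),
)
uploader.precheck()  # checks that the catalog, database, and table exist
# uploader.run(path=staged_json_path, file_data=file_data)  # staged output and FileData as in the test
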
@@ -129,8 +129,13 @@ class SQLIndexer(Indexer, ABC):
129
129
  connection_config: SQLConnectionConfig
130
130
  index_config: SQLIndexerConfig
131
131
 
132
- def _get_doc_ids(self) -> list[str]:
132
+ @contextmanager
133
+ def get_cursor(self) -> Generator[Any, None, None]:
133
134
  with self.connection_config.get_cursor() as cursor:
135
+ yield cursor
136
+
137
+ def _get_doc_ids(self) -> list[str]:
138
+ with self.get_cursor() as cursor:
134
139
  cursor.execute(
135
140
  f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
136
141
  )
@@ -140,7 +145,7 @@ class SQLIndexer(Indexer, ABC):
140
145
 
141
146
  def precheck(self) -> None:
142
147
  try:
143
- with self.connection_config.get_cursor() as cursor:
148
+ with self.get_cursor() as cursor:
144
149
  cursor.execute("SELECT 1;")
145
150
  except Exception as e:
146
151
  logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -182,6 +187,11 @@ class SQLDownloader(Downloader, ABC):
182
187
  connection_config: SQLConnectionConfig
183
188
  download_config: SQLDownloaderConfig
184
189
 
190
+ @contextmanager
191
+ def get_cursor(self) -> Generator[Any, None, None]:
192
+ with self.connection_config.get_cursor() as cursor:
193
+ yield cursor
194
+
185
195
  @abstractmethod
186
196
  def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
187
197
  pass
@@ -323,12 +333,17 @@ class SQLUploader(Uploader):
323
333
 
324
334
  def precheck(self) -> None:
325
335
  try:
326
- with self.connection_config.get_cursor() as cursor:
336
+ with self.get_cursor() as cursor:
327
337
  cursor.execute("SELECT 1;")
328
338
  except Exception as e:
329
339
  logger.error(f"failed to validate connection: {e}", exc_info=True)
330
340
  raise DestinationConnectionError(f"failed to validate connection: {e}")
331
341
 
342
+ @contextmanager
343
+ def get_cursor(self) -> Generator[Any, None, None]:
344
+ with self.connection_config.get_cursor() as cursor:
345
+ yield cursor
346
+
332
347
  def prepare_data(
333
348
  self, columns: list[str], data: tuple[tuple[Any, ...], ...]
334
349
  ) -> list[tuple[Any, ...]]:
@@ -346,7 +361,7 @@ class SQLUploader(Uploader):
346
361
  output.append(tuple(parsed))
347
362
  return output
348
363
 
349
- def _fit_to_schema(self, df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
364
+ def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
350
365
  columns = set(df.columns)
351
366
  schema_fields = set(columns)
352
367
  columns_to_drop = columns - schema_fields
@@ -367,6 +382,7 @@ class SQLUploader(Uploader):
367
382
 
368
383
  for column in missing_columns:
369
384
  df[column] = pd.Series()
385
+ return df
370
386
 
371
387
  def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
372
388
  if self.can_delete():
@@ -378,7 +394,7 @@ class SQLUploader(Uploader):
378
394
  f"{self.upload_config.record_id_key}, skipping delete"
379
395
  )
380
396
  df.replace({np.nan: None}, inplace=True)
381
- self._fit_to_schema(df=df, columns=self.get_table_columns())
397
+ self._fit_to_schema(df=df)
382
398
 
383
399
  columns = list(df.columns)
384
400
  stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
@@ -393,7 +409,7 @@ class SQLUploader(Uploader):
393
409
  f" with batch size {self.upload_config.batch_size}"
394
410
  )
395
411
  for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
396
- with self.connection_config.get_cursor() as cursor:
412
+ with self.get_cursor() as cursor:
397
413
  values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
398
414
  # For debugging purposes:
399
415
  # for val in values:
@@ -406,7 +422,7 @@ class SQLUploader(Uploader):
406
422
  cursor.executemany(stmt, values)
407
423
 
408
424
  def get_table_columns(self) -> list[str]:
409
- with self.connection_config.get_cursor() as cursor:
425
+ with self.get_cursor() as cursor:
410
426
  cursor.execute(f"SELECT * from {self.upload_config.table_name}")
411
427
  return [desc[0] for desc in cursor.description]
412
428
 
@@ -420,10 +436,11 @@ class SQLUploader(Uploader):
420
436
  f"from table {self.upload_config.table_name}"
421
437
  )
422
438
  stmt = f"DELETE FROM {self.upload_config.table_name} WHERE {self.upload_config.record_id_key} = {self.values_delimiter}" # noqa: E501
423
- with self.connection_config.get_cursor() as cursor:
439
+ with self.get_cursor() as cursor:
424
440
  cursor.execute(stmt, [file_data.identifier])
425
441
  rowcount = cursor.rowcount
426
- logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")
442
+ if rowcount > 0:
443
+ logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")
427
444
 
428
445
  def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
429
446
  df = pd.DataFrame(data)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.3.14
3
+ Version: 0.3.15
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,14 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: python-dateutil
25
+ Requires-Dist: ndjson
26
+ Requires-Dist: pydantic>=2.7
26
27
  Requires-Dist: pandas
27
- Requires-Dist: tqdm
28
28
  Requires-Dist: dataclasses-json
29
- Requires-Dist: opentelemetry-sdk
29
+ Requires-Dist: tqdm
30
30
  Requires-Dist: click
31
- Requires-Dist: ndjson
32
- Requires-Dist: pydantic>=2.7
31
+ Requires-Dist: python-dateutil
32
+ Requires-Dist: opentelemetry-sdk
33
33
  Provides-Extra: airtable
34
34
  Requires-Dist: pyairtable; extra == "airtable"
35
35
  Provides-Extra: astradb
@@ -40,11 +40,11 @@ Requires-Dist: fsspec; extra == "azure"
40
40
  Provides-Extra: azure-ai-search
41
41
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
42
42
  Provides-Extra: bedrock
43
- Requires-Dist: boto3; extra == "bedrock"
44
43
  Requires-Dist: aioboto3; extra == "bedrock"
44
+ Requires-Dist: boto3; extra == "bedrock"
45
45
  Provides-Extra: biomed
46
- Requires-Dist: requests; extra == "biomed"
47
46
  Requires-Dist: bs4; extra == "biomed"
47
+ Requires-Dist: requests; extra == "biomed"
48
48
  Provides-Extra: box
49
49
  Requires-Dist: boxfs; extra == "box"
50
50
  Requires-Dist: fsspec; extra == "box"
@@ -59,6 +59,8 @@ Provides-Extra: couchbase
59
59
  Requires-Dist: couchbase; extra == "couchbase"
60
60
  Provides-Extra: csv
61
61
  Requires-Dist: unstructured[tsv]; extra == "csv"
62
+ Provides-Extra: databricks-delta-tables
63
+ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
62
64
  Provides-Extra: databricks-volumes
63
65
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
64
66
  Provides-Extra: delta-table
@@ -71,8 +73,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
71
73
  Provides-Extra: docx
72
74
  Requires-Dist: unstructured[docx]; extra == "docx"
73
75
  Provides-Extra: dropbox
74
- Requires-Dist: fsspec; extra == "dropbox"
75
76
  Requires-Dist: dropboxdrivefs; extra == "dropbox"
77
+ Requires-Dist: fsspec; extra == "dropbox"
76
78
  Provides-Extra: duckdb
77
79
  Requires-Dist: duckdb; extra == "duckdb"
78
80
  Provides-Extra: elasticsearch
@@ -82,8 +84,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
82
84
  Provides-Extra: embed-mixedbreadai
83
85
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
84
86
  Provides-Extra: embed-octoai
85
- Requires-Dist: openai; extra == "embed-octoai"
86
87
  Requires-Dist: tiktoken; extra == "embed-octoai"
88
+ Requires-Dist: openai; extra == "embed-octoai"
87
89
  Provides-Extra: embed-vertexai
88
90
  Requires-Dist: vertexai; extra == "embed-vertexai"
89
91
  Provides-Extra: embed-voyageai
@@ -92,8 +94,8 @@ Provides-Extra: epub
92
94
  Requires-Dist: unstructured[epub]; extra == "epub"
93
95
  Provides-Extra: gcs
94
96
  Requires-Dist: bs4; extra == "gcs"
95
- Requires-Dist: fsspec; extra == "gcs"
96
97
  Requires-Dist: gcsfs; extra == "gcs"
98
+ Requires-Dist: fsspec; extra == "gcs"
97
99
  Provides-Extra: github
98
100
  Requires-Dist: pygithub>1.58.0; extra == "github"
99
101
  Requires-Dist: requests; extra == "github"
@@ -122,22 +124,22 @@ Provides-Extra: msg
122
124
  Requires-Dist: unstructured[msg]; extra == "msg"
123
125
  Provides-Extra: neo4j
124
126
  Requires-Dist: cymple; extra == "neo4j"
125
- Requires-Dist: networkx; extra == "neo4j"
126
127
  Requires-Dist: neo4j; extra == "neo4j"
128
+ Requires-Dist: networkx; extra == "neo4j"
127
129
  Provides-Extra: notion
130
+ Requires-Dist: httpx; extra == "notion"
131
+ Requires-Dist: htmlBuilder; extra == "notion"
128
132
  Requires-Dist: backoff; extra == "notion"
129
133
  Requires-Dist: notion-client; extra == "notion"
130
- Requires-Dist: htmlBuilder; extra == "notion"
131
- Requires-Dist: httpx; extra == "notion"
132
134
  Provides-Extra: odt
133
135
  Requires-Dist: unstructured[odt]; extra == "odt"
134
136
  Provides-Extra: onedrive
135
- Requires-Dist: bs4; extra == "onedrive"
136
137
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
138
+ Requires-Dist: bs4; extra == "onedrive"
137
139
  Requires-Dist: msal; extra == "onedrive"
138
140
  Provides-Extra: openai
139
- Requires-Dist: openai; extra == "openai"
140
141
  Requires-Dist: tiktoken; extra == "openai"
142
+ Requires-Dist: openai; extra == "openai"
141
143
  Provides-Extra: opensearch
142
144
  Requires-Dist: opensearch-py; extra == "opensearch"
143
145
  Provides-Extra: org
@@ -168,8 +170,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
168
170
  Provides-Extra: rtf
169
171
  Requires-Dist: unstructured[rtf]; extra == "rtf"
170
172
  Provides-Extra: s3
171
- Requires-Dist: s3fs; extra == "s3"
172
173
  Requires-Dist: fsspec; extra == "s3"
174
+ Requires-Dist: s3fs; extra == "s3"
173
175
  Provides-Extra: salesforce
174
176
  Requires-Dist: simple-salesforce; extra == "salesforce"
175
177
  Provides-Extra: sftp
@@ -183,16 +185,16 @@ Requires-Dist: singlestoredb; extra == "singlestore"
183
185
  Provides-Extra: slack
184
186
  Requires-Dist: slack-sdk[optional]; extra == "slack"
185
187
  Provides-Extra: snowflake
186
- Requires-Dist: psycopg2-binary; extra == "snowflake"
187
188
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
189
+ Requires-Dist: psycopg2-binary; extra == "snowflake"
188
190
  Provides-Extra: togetherai
189
191
  Requires-Dist: together; extra == "togetherai"
190
192
  Provides-Extra: tsv
191
193
  Requires-Dist: unstructured[tsv]; extra == "tsv"
192
194
  Provides-Extra: vectara
193
- Requires-Dist: requests; extra == "vectara"
194
- Requires-Dist: aiofiles; extra == "vectara"
195
195
  Requires-Dist: httpx; extra == "vectara"
196
+ Requires-Dist: aiofiles; extra == "vectara"
197
+ Requires-Dist: requests; extra == "vectara"
196
198
  Provides-Extra: weaviate
197
199
  Requires-Dist: weaviate-client; extra == "weaviate"
198
200
  Provides-Extra: wikipedia
@@ -16,7 +16,7 @@ test/integration/connectors/test_mongodb.py,sha256=0A6DvF-iTCSZzOefisd_i20j9li8u
16
16
  test/integration/connectors/test_neo4j.py,sha256=r4TRYtTXeeOdcRcfa_gvslhSKvoIWrwN1FRJ5XRoH4k,8456
17
17
  test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qkoV3srltFOSSKTA,5403
18
18
  test/integration/connectors/test_onedrive.py,sha256=TcMaa5BIp8J6engS4UZ2t19WQP0NNz2rkpBB47m7A3Y,3835
19
- test/integration/connectors/test_pinecone.py,sha256=nzHwftPt-dPX4H5OrAJ6bs9qqOSOcfJL9jVEcWSPAqo,10325
19
+ test/integration/connectors/test_pinecone.py,sha256=acKEu1vnAk0Ht3FhCnGtOEKaj_YlgCzZB7wRU17ehQ0,12407
20
20
  test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
21
21
  test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
22
22
  test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
@@ -34,6 +34,7 @@ test/integration/connectors/elasticsearch/conftest.py,sha256=-i4_7MkIxSQENz7nuD2
34
34
  test/integration/connectors/elasticsearch/test_elasticsearch.py,sha256=TsSEPsyaTUoEvFBadinrdM0b5C4FoUtEwCv24OUbpO8,12072
35
35
  test/integration/connectors/elasticsearch/test_opensearch.py,sha256=7b7z0GqoBsBqA3IK35N6axmwEMjzJ1l3Fg2WT2c7uqs,11450
36
36
  test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ test/integration/connectors/sql/test_databricks_delta_tables.py,sha256=UjVjw5hVoMSNJoYdoYympYow25gvcDAEHLmUmOJKz7I,5036
37
38
  test/integration/connectors/sql/test_postgres.py,sha256=bGDyzLRpgrXO7nl0U8nF2zSNr6ykUG-w8T4daIqUCG4,6970
38
39
  test/integration/connectors/sql/test_singlestore.py,sha256=XeU2s4Kt_3tGyaDYYKTgYjdOyb8j2dnz4TgSMwFUjWs,6153
39
40
  test/integration/connectors/sql/test_snowflake.py,sha256=LEwsRDoC6-rRiwYsqeo5B9Eo6RYygLLGAUsrtrgI9pM,7494
@@ -96,7 +97,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
96
97
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
98
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
98
99
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
99
- unstructured_ingest/__version__.py,sha256=F6lwOpOsFNj6MPWAGEZBkXIqf1jekdFZ5wZw3drsib8,43
100
+ unstructured_ingest/__version__.py,sha256=31lJzr6gfqqAcVEa6C2kjStzBSJPXWUyP7eRpa8Y7gI,43
100
101
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
101
102
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
102
103
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -386,7 +387,7 @@ unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJi
386
387
  unstructured_ingest/v2/interfaces/indexer.py,sha256=gsa1MLhFa82BzD2h4Yb7ons0VxRwKINZOrzvHAahwVU,846
387
388
  unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
388
389
  unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
389
- unstructured_ingest/v2/interfaces/upload_stager.py,sha256=HSSq_htv009-5yA8QqIi6rRnkfI1fnDkX5JRom8rNDY,3566
390
+ unstructured_ingest/v2/interfaces/upload_stager.py,sha256=nbMuo_U6Gqn9bDJrAJTCjrZXKMw_G28OZOuNsT23i0k,3608
390
391
  unstructured_ingest/v2/interfaces/uploader.py,sha256=T2oHbN-d4Px1w1oATKKYZA10aUssqytEpiaqBM92r0Q,1600
391
392
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
392
393
  unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
@@ -427,19 +428,20 @@ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNN
427
428
  unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
428
429
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=d6gC40YmfqBNXxizAt4MO4OOu5BoCZ7SAe1AbNwTP0E,18322
429
430
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
430
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=cohF7gBj0opSGKXlENSdGfTtyIKMHd1pwu4ydeb7JAY,10605
431
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
431
432
  unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
432
433
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
433
434
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
434
435
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
435
436
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
436
437
  unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
437
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
438
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=MDnTUjFlqOP4rmQA5wkgT2DhwjhFhUwPpUPGSzqCOOE,7577
438
+ unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
439
+ unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
439
440
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
440
441
  unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=cb-EUW0T-linZMkbU6AcKEGWnFHQvhpO5Abtps4P2X0,3532
441
442
  unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=tR8NubkyHw49IpW_42g6w1Koxlm56EPiPf1lB-eoRSI,2783
442
443
  unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=dJLD1fueXf8_0AfC4cg0G7siJZVefz68iuEx2Kq7rMs,2890
444
+ unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=muj7G2JFO_WwAPub14k0VqDmN3c56t9MA60rM48wal8,4750
443
445
  unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
444
446
  unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=XTV9Pox3_xVmI8YVQWC9Bn6PugbPM49kp4Scv1OXFys,2649
445
447
  unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=oUHHaLpO2pWW2Lu4Mc-XFjrA0ze97205WQ_xP95ua4M,4296
@@ -538,20 +540,21 @@ unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-
538
540
  unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ4DU8yhJWMpL82QYwBVdPTxxNuV127U,1588
539
541
  unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py,sha256=BHI7HYSdbS05j2vrjyDvLzVG1WfsM8osKeq-lttlybQ,5437
540
542
  unstructured_ingest/v2/processes/connectors/qdrant/server.py,sha256=odvCZWZp8DmRxLXMR7tHhW-c7UQbix1_zpFdfXfCvKI,1613
541
- unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=E16CXRBw8fZKTuXIECns5wif_I07oncBHskVxHC4p7w,1448
543
+ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=mxcrncrjeP-C2jqQoTOOpGjV3Bmyfg4efT5lq_c-V1E,1760
544
+ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=s_W6wSvyIXZ9mdAxvgSXFeFSze9E7pwIvc38p1hVDLM,8839
542
545
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
543
546
  unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=-2E9dsdNhjAiuzeSBytBbAhljOhvQ8kN8wvlUESvLo8,5465
544
547
  unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=8qCm1XiJmVxy8TSeoxwmQrE2W1x8S8At2ctrS_lJ8-I,7780
545
- unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=FtI5DSMd1QUgoYLn8NAVoETc4qwCbFEwSulqziXyODY,15292
548
+ unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=ZGpeBfiOEzVaSiQxwqJkMC00Eu6TQhsrZKHnOHM0Xug,15667
546
549
  unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=Q5RAqn5Ccw-pbeKZLkiMn5IVw6EemCMukXzLlS7pDhc,5162
547
550
  unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
548
551
  unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
549
552
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
550
553
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
551
554
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=X1yv1H_orDQ-J965EMXhR2XaURqe8vovSi9n1fk85B4,10499
552
- unstructured_ingest-0.3.14.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
553
- unstructured_ingest-0.3.14.dist-info/METADATA,sha256=PiIp0oqW-sia84q3v0SXUGy-Oh0fzUZCmZqBogsg7qA,7813
554
- unstructured_ingest-0.3.14.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
555
- unstructured_ingest-0.3.14.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
556
- unstructured_ingest-0.3.14.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
557
- unstructured_ingest-0.3.14.dist-info/RECORD,,
555
+ unstructured_ingest-0.3.15.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
556
+ unstructured_ingest-0.3.15.dist-info/METADATA,sha256=rZFAbiv0HZ-VUWVk4MP2vANZuzsxJLhK2_QWZ5zTjRA,7929
557
+ unstructured_ingest-0.3.15.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
558
+ unstructured_ingest-0.3.15.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
559
+ unstructured_ingest-0.3.15.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
560
+ unstructured_ingest-0.3.15.dist-info/RECORD,,