unstructured-ingest 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (35)
  1. test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
  2. test/integration/connectors/utils/validation/equality.py +2 -1
  3. test/unit/v2/connectors/databricks/__init__.py +0 -0
  4. test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
  5. test/unit/v2/connectors/sql/test_sql.py +4 -2
  6. unstructured_ingest/__version__.py +1 -1
  7. unstructured_ingest/utils/data_prep.py +11 -3
  8. unstructured_ingest/utils/html.py +109 -0
  9. unstructured_ingest/utils/ndjson.py +52 -0
  10. unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
  12. unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
  13. unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
  14. unstructured_ingest/v2/processes/connectors/confluence.py +95 -25
  15. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
  16. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
  17. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
  18. unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
  19. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
  20. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
  21. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
  22. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
  23. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
  24. unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
  25. unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
  26. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
  27. unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -7
  28. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
  29. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
  30. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/METADATA +23 -20
  31. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/RECORD +35 -30
  32. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/LICENSE.md +0 -0
  33. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/WHEEL +0 -0
  34. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/entry_points.txt +0 -0
  35. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
@@ -3,10 +3,11 @@ import os
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator, Optional
+from typing import Any, Generator

 from pydantic import Field

+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -14,9 +15,9 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
 from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
-    DatabrickDeltaTablesConnectionConfig,
-    DatabrickDeltaTablesUploadStager,
-    DatabrickDeltaTablesUploadStagerConfig,
+    DatabricksDeltaTablesConnectionConfig,
+    DatabricksDeltaTablesUploadStager,
+    DatabricksDeltaTablesUploadStagerConfig,
 )

 CONNECTOR_TYPE = "databricks_volume_delta_tables"
@@ -28,17 +29,16 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix


 @dataclass
-class DatabricksVolumeDeltaTableStager(DatabrickDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
+    def write_output(self, output_path: Path, data: list[dict]) -> None:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-        with output_path.with_suffix(".json").open("w") as f:
-            json.dump(data, f)
+        write_data(path=output_path.with_suffix(".json"), data=data, indent=None)


 @dataclass
 class DatabricksVolumeDeltaTableUploader(Uploader):
-    connection_config: DatabrickDeltaTablesConnectionConfig
+    connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE

@@ -78,7 +78,10 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     @contextmanager
     def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
         with self.connection_config.get_cursor(**connect_kwargs) as cursor:
+            logger.debug(f"executing: USE CATALOG: '{self.upload_config.catalog}'")
             cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            logger.debug(f"executing: USE DATABASE: {self.upload_config.database}")
+            cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -98,9 +101,9 @@ class DatabricksVolumeDeltaTableUploader(Uploader):


 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=DatabrickDeltaTablesConnectionConfig,
+    connection_config=DatabricksDeltaTablesConnectionConfig,
     uploader=DatabricksVolumeDeltaTableUploader,
     uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
     upload_stager=DatabricksVolumeDeltaTableStager,
-    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )
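
For context on the stager change above, here is a minimal sketch of what the new write_output path is expected to do. write_data comes from unstructured_ingest.utils.data_prep; the compact-serialization behavior shown is an assumption based on the indent=None argument, not a documented contract.

    import json
    from pathlib import Path

    def write_output_sketch(output_path: Path, data: list[dict]) -> None:
        # Assumed equivalent of write_data(path=..., data=..., indent=None): serialize
        # the staged elements as one compact JSON document so no embedded newlines
        # break the Databricks volume -> delta table copy.
        with output_path.with_suffix(".json").open("w") as f:
            json.dump(data, f, indent=None)

    write_output_sketch(Path("example-elements"), [{"element_id": "abc", "text": "hello"}])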

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -4,7 +4,7 @@ from typing import Any

 import pandas as pd

-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id

@@ -96,5 +96,5 @@ class BaseDuckDBUploadStager(UploadStager):
            df[column] = df[column].apply(str)

        data = df.to_dict(orient="records")
-       self.write_output(output_path=output_path, data=data)
+       write_data(path=output_path, data=data)
        return output_path

unstructured_ingest/v2/processes/connectors/fsspec/azure.py
@@ -128,22 +128,22 @@ class AzureIndexer(FsspecIndexer):
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = (
-            str(file_data.get("creation_time").timestamp())
-            if "creation_time" in file_data
+            str(file_info.get("creation_time").timestamp())
+            if "creation_time" in file_info
             else None
         )
         date_modified = (
-            str(file_data.get("last_modified").timestamp())
-            if "last_modified" in file_data
+            str(file_info.get("last_modified").timestamp())
+            if "last_modified" in file_info
             else None
         )

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("etag")
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,

unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -104,22 +104,22 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str := file_data.get("modified_at"):
+        if modified_at_str := file_info.get("modified_at"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str := file_data.get("created_at"):
+        if created_at_str := file_info.get("created_at"):
             date_created = str(parser.parse(created_at_str).timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("id")
+        version = file_info.get("id")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
@@ -93,15 +93,15 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_path(self, file_data: dict) -> str:
-        return file_data["name"]
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"].lstrip("/")
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"].lstrip("/")
         date_created = None
         date_modified = None
-        server_modified = file_data.get("server_modified")
-        client_modified = file_data.get("client_modified")
+        server_modified = file_info.get("server_modified")
+        client_modified = file_info.get("client_modified")
         if server_modified and client_modified and server_modified > client_modified:
             date_created = str(client_modified.timestamp())
             date_modified = str(server_modified.timestamp())
@@ -109,13 +109,13 @@ class DropboxIndexer(FsspecIndexer):
             date_created = str(server_modified.timestamp())
             date_modified = str(client_modified.timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("content_hash")
+        version = file_info.get("content_hash")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -119,7 +119,7 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise self.wrap_error(e=e)

-    def get_file_data(self) -> list[dict[str, Any]]:
+    def get_file_info(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
@@ -156,24 +156,56 @@ class FsspecIndexer(Indexer):

         return random.sample(files, n)

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()

-    def get_path(self, file_data: dict) -> str:
-        return file_data["name"]
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]

     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data)

+    def create_init_file_data(self, remote_filepath: Optional[str] = None) -> FileData:
+        # Create initial file data that requires no network calls and is constructed purely
+        # with information that exists in the config
+        remote_filepath = remote_filepath or self.index_config.remote_url
+        path_without_protocol = remote_filepath.split("://")[1]
+        rel_path = remote_filepath.replace(path_without_protocol, "").lstrip("/")
+        return FileData(
+            identifier=str(uuid5(NAMESPACE_DNS, remote_filepath)),
+            connector_type=self.connector_type,
+            display_name=remote_filepath,
+            source_identifiers=SourceIdentifiers(
+                filename=Path(remote_filepath).name,
+                rel_path=rel_path or None,
+                fullpath=remote_filepath,
+            ),
+            metadata=FileDataSourceMetadata(url=remote_filepath),
+        )
+
+    def hydrate_file_data(self, init_file_data: FileData):
+        # Get file info
+        with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+            files = client.ls(self.index_config.path_without_protocol, detail=True)
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        if not filtered_files:
+            raise ValueError(f"{init_file_data} did not reference any valid file")
+        if len(filtered_files) > 1:
+            raise ValueError(f"{init_file_data} referenced more than one file")
+        file_info = filtered_files[0]
+        init_file_data.additional_metadata = self.get_metadata(file_info=file_info)
+
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.get_file_data()
-        for file_data in files:
-            file_path = self.get_path(file_data=file_data)
+        files = self.get_file_info()
+        for file_info in files:
+            file_path = self.get_path(file_info=file_info)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
            rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")

-            additional_metadata = self.sterilize_info(file_data=file_data)
+            additional_metadata = self.sterilize_info(file_data=file_info)
             additional_metadata["original_file_path"] = file_path
             yield FileData(
                 identifier=str(uuid5(NAMESPACE_DNS, file_path)),
@@ -183,7 +215,7 @@ class FsspecIndexer(Indexer):
                     rel_path=rel_path or None,
                     fullpath=file_path,
                 ),
-                metadata=self.get_metadata(file_data=file_data),
+                metadata=self.get_metadata(file_info=file_info),
                 additional_metadata=additional_metadata,
                 display_name=file_path,
             )
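
The new create_init_file_data/hydrate_file_data pair splits indexing into a config-only phase and a network phase. The key property is that the FileData identifier is a deterministic UUID5 of the remote URL, so it can be computed before any listing call; a small runnable sketch of that derivation (the s3:// URL is just an example):

    from pathlib import Path
    from uuid import NAMESPACE_DNS, uuid5

    remote_filepath = "s3://example-bucket/docs/report.pdf"  # placeholder URL

    # Same derivation as create_init_file_data: identifier and filename come purely
    # from the configured remote URL, with no call to the storage backend.
    identifier = str(uuid5(NAMESPACE_DNS, remote_filepath))
    filename = Path(remote_filepath).name

    print(identifier)  # stable across runs for the same URL
    print(filename)    # "report.pdf"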

unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
@@ -131,22 +131,22 @@ class GcsIndexer(FsspecIndexer):
     index_config: GcsIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str := file_data.get("updated"):
+        if modified_at_str := file_info.get("updated"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str := file_data.get("timeCreated"):
+        if created_at_str := file_info.get("timeCreated"):
             date_created = str(parser.parse(created_at_str).timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("etag")
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/v2/processes/connectors/fsspec/s3.py
@@ -110,22 +110,22 @@ class S3Indexer(FsspecIndexer):
     def wrap_error(self, e: Exception) -> Exception:
         return self.connection_config.wrap_error(e=e)

-    def get_path(self, file_data: dict) -> str:
-        return file_data["Key"]
+    def get_path(self, file_info: dict) -> str:
+        return file_info["Key"]

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["Key"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["Key"]
         date_created = None
         date_modified = None
-        modified = file_data.get("LastModified")
+        modified = file_info.get("LastModified")
         if modified:
             date_created = str(modified.timestamp())
             date_modified = str(modified.timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
-        file_size = file_size or file_data.get("Size")
+        file_size = file_info.get("size") if "size" in file_info else None
+        file_size = file_size or file_info.get("Size")

-        version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
+        version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py
@@ -107,12 +107,12 @@ class SftpIndexer(FsspecIndexer):
            file.identifier = new_identifier
            yield file

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
-        date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
-        date_modified = str(file_data.get("mtime").timestamp()) if "mtime" in file_data else None
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
+        date_created = str(file_info.get("time").timestamp()) if "time" in file_info else None
+        date_modified = str(file_info.get("mtime").timestamp()) if "mtime" in file_info else None

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

         record_locator = {
             "protocol": self.index_config.protocol,

unstructured_ingest/v2/processes/connectors/sql/__init__.py
@@ -15,11 +15,14 @@ from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
 from .snowflake import snowflake_destination_entry, snowflake_source_entry
 from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
 from .sqlite import sqlite_destination_entry, sqlite_source_entry
+from .vastdb import CONNECTOR_TYPE as VASTDB_CONNECTOR_TYPE
+from .vastdb import vastdb_destination_entry, vastdb_source_entry

 add_source_entry(source_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_source_entry)
 add_source_entry(source_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_source_entry)
 add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_entry)
 add_source_entry(source_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_source_entry)
+add_source_entry(source_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_source_entry)

 add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
 add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)
@@ -31,3 +34,4 @@ add_destination_entry(
     destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
     entry=databricks_delta_tables_destination_entry,
 )
+add_destination_entry(destination_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_destination_entry)
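
The vastdb wiring above follows the same registration idiom as the other SQL connectors: the module exports a CONNECTOR_TYPE plus source and destination entries, and the package __init__ adds them to shared registries. A self-contained toy version of that idiom; the dict registries and placeholder entry objects are illustrative, not the package's actual implementation.

    # Toy registries standing in for the ones behind add_source_entry/add_destination_entry.
    source_registry: dict[str, object] = {}
    destination_registry: dict[str, object] = {}

    def add_source_entry(source_type: str, entry: object) -> None:
        source_registry[source_type] = entry

    def add_destination_entry(destination_type: str, entry: object) -> None:
        destination_registry[destination_type] = entry

    # What the vastdb module contributes, with placeholder entry objects.
    VASTDB_CONNECTOR_TYPE = "vastdb"
    vastdb_source_entry = object()
    vastdb_destination_entry = object()

    add_source_entry(source_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_source_entry)
    add_destination_entry(destination_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_destination_entry)
    print(sorted(source_registry), sorted(destination_registry))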

unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "databricks_delta_tables"


-class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
+class DatabricksDeltaTablesAccessConfig(SQLAccessConfig):
     token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
@@ -39,8 +39,8 @@ class DatabrickDeltaTablesAccessConfig(SQLAccessConfig):
     )


-class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
-    access_config: Secret[DatabrickDeltaTablesAccessConfig]
+class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabricksDeltaTablesAccessConfig]
     server_hostname: str = Field(description="server hostname connection config value")
     http_path: str = Field(description="http path connection config value")
     user_agent: str = "unstructuredio_oss"
@@ -102,24 +102,24 @@ class DatabrickDeltaTablesConnectionConfig(SQLConnectionConfig):
         yield cursor


-class DatabrickDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
+class DatabricksDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
     pass


-class DatabrickDeltaTablesUploadStager(SQLUploadStager):
-    upload_stager_config: DatabrickDeltaTablesUploadStagerConfig
+class DatabricksDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabricksDeltaTablesUploadStagerConfig


-class DatabrickDeltaTablesUploaderConfig(SQLUploaderConfig):
+class DatabricksDeltaTablesUploaderConfig(SQLUploaderConfig):
     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
     database: str = Field(description="Database name", default="default")
     table_name: str = Field(description="Table name")


 @dataclass
-class DatabrickDeltaTablesUploader(SQLUploader):
-    upload_config: DatabrickDeltaTablesUploaderConfig
-    connection_config: DatabrickDeltaTablesConnectionConfig
+class DatabricksDeltaTablesUploader(SQLUploader):
+    upload_config: DatabricksDeltaTablesUploaderConfig
+    connection_config: DatabricksDeltaTablesConnectionConfig
     connector_type: str = CONNECTOR_TYPE

     @contextmanager
@@ -205,9 +205,9 @@ class DatabrickDeltaTablesUploader(SQLUploader):


 databricks_delta_tables_destination_entry = DestinationRegistryEntry(
-    connection_config=DatabrickDeltaTablesConnectionConfig,
-    uploader=DatabrickDeltaTablesUploader,
-    uploader_config=DatabrickDeltaTablesUploaderConfig,
-    upload_stager=DatabrickDeltaTablesUploadStager,
-    upload_stager_config=DatabrickDeltaTablesUploadStagerConfig,
+    connection_config=DatabricksDeltaTablesConnectionConfig,
+    uploader=DatabricksDeltaTablesUploader,
+    uploader_config=DatabricksDeltaTablesUploaderConfig,
+    upload_stager=DatabricksDeltaTablesUploadStager,
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
 )
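
The hunks above are a pure rename (Databrick... to Databricks...); the configuration surface is unchanged. A hedged construction sketch using only the fields visible in this diff; the hostname, HTTP path, token, and table names are placeholders, and it assumes the package is installed and no other fields are required.

    from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
        DatabricksDeltaTablesAccessConfig,
        DatabricksDeltaTablesConnectionConfig,
        DatabricksDeltaTablesUploaderConfig,
    )

    connection_config = DatabricksDeltaTablesConnectionConfig(
        access_config=DatabricksDeltaTablesAccessConfig(token="dapi-placeholder"),
        server_hostname="adb-0000000000000000.0.azuredatabricks.net",  # placeholder
        http_path="/sql/1.0/warehouses/placeholder",  # placeholder
    )
    uploader_config = DatabricksDeltaTablesUploaderConfig(
        catalog="main", database="default", table_name="elements"  # placeholder names
    )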

unstructured_ingest/v2/processes/connectors/sql/singlestore.py
@@ -3,6 +3,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

+import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.v2.logger import logger
@@ -139,7 +140,7 @@ class SingleStoreUploader(SQLUploader):
             if isinstance(value, (list, dict)):
                 value = json.dumps(value)
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))
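
The value is None or pd.isna(value) guard (applied here and again in the sql.py and sqlite.py hunks below) exists because pandas represents missing values as NaN/NaT rather than None once elements pass through a DataFrame. A minimal runnable illustration of the difference:

    import pandas as pd

    def parse_optional_date(value):
        # New guard: treat both None and pandas NaN/NaT as "no date" instead of
        # handing them to the date parser (parse_date_string in the connectors).
        if value is None or pd.isna(value):
            return None
        return str(value)  # stand-in for parse_date_string(value)

    print(parse_optional_date(None))           # None
    print(parse_optional_date(float("nan")))   # None -- previously this reached the parser
    print(parse_optional_date(pd.NaT))         # None
    print(parse_optional_date("2024-01-01"))   # "2024-01-01"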

unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -14,7 +14,7 @@ from dateutil import parser
 from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
-from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe
+from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe, write_data
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -292,6 +292,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df

+    def write_output(self, output_path: Path, data: list[dict]) -> None:
+        write_data(path=output_path, data=data)
+
     def run(
         self,
         elements_filepath: Path,
@@ -332,6 +335,7 @@ class SQLUploader(Uploader):
     upload_config: SQLUploaderConfig
     connection_config: SQLConnectionConfig
     values_delimiter: str = "?"
+    _columns: list[str] = field(init=False, default=None)

     def precheck(self) -> None:
         try:
@@ -354,7 +358,7 @@ class SQLUploader(Uploader):
         parsed = []
         for column_name, value in zip(columns, row):
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):  # pandas is nan
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))
@@ -364,8 +368,9 @@ class SQLUploader(Uploader):
         return output

     def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+        table_columns = self.get_table_columns()
         columns = set(df.columns)
-        schema_fields = set(columns)
+        schema_fields = set(table_columns)
         columns_to_drop = columns - schema_fields
         missing_columns = schema_fields - columns

@@ -395,8 +400,8 @@ class SQLUploader(Uploader):
                 f"record id column "
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
+        df = self._fit_to_schema(df=df)
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df)

         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
@@ -424,9 +429,11 @@ class SQLUploader(Uploader):
             cursor.executemany(stmt, values)

     def get_table_columns(self) -> list[str]:
-        with self.get_cursor() as cursor:
-            cursor.execute(f"SELECT * from {self.upload_config.table_name}")
-            return [desc[0] for desc in cursor.description]
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = [desc[0] for desc in cursor.description]
+        return self._columns

     def can_delete(self) -> bool:
         return self.upload_config.record_id_key in self.get_table_columns()
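
Two behavioral fixes in the sql.py hunks above are easy to miss: _fit_to_schema now compares the DataFrame against the real table columns (previously it compared the frame's columns with themselves, a no-op) and its result is assigned back, and get_table_columns caches the column list and adds LIMIT 1 so it no longer scans the whole table on every call. A self-contained sketch of the schema-fitting logic; the drop/fill details are an assumption based on the variable names in the hunk.

    import pandas as pd

    def fit_to_schema(df: pd.DataFrame, table_columns: list[str]) -> pd.DataFrame:
        columns = set(df.columns)
        schema_fields = set(table_columns)        # was set(columns) before the fix: a no-op
        columns_to_drop = columns - schema_fields
        missing_columns = schema_fields - columns
        df = df.drop(columns=list(columns_to_drop))   # assumed handling of extra columns
        for missing in missing_columns:
            df[missing] = None                        # assumed handling of missing columns
        return df

    df = pd.DataFrame([{"id": 1, "text": "hello", "extra": "dropped"}])
    print(fit_to_schema(df, table_columns=["id", "text", "embeddings"]))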

unstructured_ingest/v2/processes/connectors/sql/sqlite.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator

+import pandas as pd
 from pydantic import Field, Secret, model_validator

 from unstructured_ingest.v2.logger import logger
@@ -141,7 +142,7 @@ class SQLiteUploader(SQLUploader):
             if isinstance(value, (list, dict)):
                 value = json.dumps(value)
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))