unstructured-ingest 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic.

Files changed (34)
  1. test/integration/connectors/test_confluence.py +4 -4
  2. test/integration/connectors/utils/validation/equality.py +2 -1
  3. test/unit/v2/connectors/sql/__init__.py +0 -0
  4. test/unit/v2/connectors/sql/test_sql.py +74 -0
  5. test/unit/v2/connectors/test_confluence.py +6 -6
  6. unstructured_ingest/__version__.py +1 -1
  7. unstructured_ingest/utils/data_prep.py +11 -3
  8. unstructured_ingest/utils/html.py +109 -0
  9. unstructured_ingest/utils/ndjson.py +52 -0
  10. unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
  12. unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
  13. unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
  14. unstructured_ingest/v2/processes/connectors/confluence.py +125 -35
  15. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
  16. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
  17. unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
  18. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
  21. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
  22. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
  23. unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
  24. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
  25. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
  26. unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -8
  27. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
  28. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
  29. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/METADATA +18 -15
  30. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/RECORD +34 -29
  31. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/LICENSE.md +0 -0
  32. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/WHEEL +0 -0
  33. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/entry_points.txt +0 -0
  34. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/confluence.py

@@ -1,3 +1,4 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Generator, List, Optional
@@ -17,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -30,38 +32,60 @@ CONNECTOR_TYPE = "confluence"


 class ConfluenceAccessConfig(AccessConfig):
-    api_token: Optional[str] = Field(description="Confluence API token", default=None)
-    access_token: Optional[str] = Field(
-        description="Confluence Personal Access Token", default=None
+    password: Optional[str] = Field(
+        description="Confluence password or Cloud API token",
+        default=None,
+    )
+    token: Optional[str] = Field(
+        description="Confluence Personal Access Token",
+        default=None,
     )


 class ConfluenceConnectionConfig(ConnectionConfig):
     url: str = Field(description="URL of the Confluence instance")
-    user_email: Optional[str] = Field(description="User email for authentication", default=None)
+    username: Optional[str] = Field(
+        description="Username or email for authentication",
+        default=None,
+    )
+    cloud: bool = Field(description="Authenticate to Confluence Cloud", default=False)
     access_config: Secret[ConfluenceAccessConfig] = Field(
         description="Access configuration for Confluence"
     )

     def model_post_init(self, __context):
         access_configs = self.access_config.get_secret_value()
-        basic_auth = self.user_email and access_configs.api_token
-        pat_auth = access_configs.access_token
+        basic_auth = self.username and access_configs.password
+        pat_auth = access_configs.token
+        if self.cloud and not basic_auth:
+            raise ValueError(
+                "cloud authentication requires username and API token (--password), "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
         if basic_auth and pat_auth:
-            raise ValueError("both forms of auth provided, only one allowed")
+            raise ValueError(
+                "both password and token provided, only one allowed, "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
         if not (basic_auth or pat_auth):
-            raise ValueError("neither forms of auth provided")
+            raise ValueError(
+                "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
+            )

     @requires_dependencies(["atlassian"], extras="confluence")
+    @contextmanager
     def get_client(self) -> "Confluence":
         from atlassian import Confluence

         access_configs = self.access_config.get_secret_value()
-        return Confluence(
+        with Confluence(
             url=self.url,
-            username=self.user_email,
-            password=access_configs.api_token,
-        )
+            username=self.username,
+            password=access_configs.password,
+            token=access_configs.token,
+            cloud=self.cloud,
+        ) as client:
+            yield client


 class ConfluenceIndexerConfig(IndexerConfig):
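
The hunk above replaces the api_token / access_token / user_email trio with password, token, username and an explicit cloud flag, and turns get_client() into a context manager so the underlying HTTP session is released when the block exits. A minimal usage sketch with placeholder URL and credentials (basic auth against Confluence Cloud; everything shown comes from the classes in this hunk):

    from unstructured_ingest.v2.processes.connectors.confluence import (
        ConfluenceAccessConfig,
        ConfluenceConnectionConfig,
    )

    connection_config = ConfluenceConnectionConfig(
        url="https://example.atlassian.net",   # placeholder instance URL
        username="user@example.com",           # username/email for basic auth
        cloud=True,                            # Cloud requires username + API token
        access_config=ConfluenceAccessConfig(password="<api-token>"),
    )

    # get_client() is now a context manager, so the session is closed on exit.
    with connection_config.get_client() as client:
        client.get_all_spaces(limit=1)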

@@ -83,8 +107,8 @@ class ConfluenceIndexer(Indexer):

             # Attempt to retrieve a list of spaces with limit=1.
             # This should only succeed if all creds are valid
-            client = self.connection_config.get_client()
-            client.get_all_spaces(limit=1)
+            with self.connection_config.get_client() as client:
+                client.get_all_spaces(limit=1)
             logger.info("Connection to Confluence successful.")
             return True
         except Exception as e:
@@ -96,21 +120,21 @@ class ConfluenceIndexer(Indexer):
         if spaces:
             return spaces
         else:
-            client = self.connection_config.get_client()
-            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            with self.connection_config.get_client() as client:
+                all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
             space_ids = [space["key"] for space in all_spaces["results"]]
             return space_ids

     def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
-        client = self.connection_config.get_client()
-        pages = client.get_all_pages_from_space(
-            space=space_id,
-            start=0,
-            limit=self.index_config.max_num_of_docs_from_each_space,
-            expand=None,
-            content_type="page",
-            status=None,
-        )
+        with self.connection_config.get_client() as client:
+            pages = client.get_all_pages_from_space(
+                space=space_id,
+                start=0,
+                limit=self.index_config.max_num_of_docs_from_each_space,
+                expand=None,
+                content_type="page",
+                status=None,
+            )
         doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
         return doc_ids

@@ -157,7 +181,18 @@ class ConfluenceIndexer(Indexer):


 class ConfluenceDownloaderConfig(DownloaderConfig):
-    pass
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )


 @dataclass
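
The downloader config, previously empty, now exposes three opt-in flags. A small sketch of enabling them (the field names come from the hunk above; the rest is a plain pydantic instantiation):

    from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceDownloaderConfig

    download_config = ConfluenceDownloaderConfig(
        extract_images=True,    # inline page images as base64 in the saved HTML
        extract_files=True,     # also download files embedded in the page
        force_download=True,    # re-fetch embedded files even if present locally
    )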

@@ -166,14 +201,37 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE

-    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
+    def download_embedded_files(
+        self, session, html: str, current_file_data: FileData
+    ) -> list[DownloadResponse]:
+        if not self.download_config.extract_files:
+            return []
+        from unstructured_ingest.utils.html import download_embedded_files
+
+        filepath = current_file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_dir = download_path.with_suffix("")
+        return download_embedded_files(
+            download_dir=download_dir,
+            original_filedata=current_file_data,
+            original_html=html,
+            session=session,
+            force_download=self.download_config.force_download,
+        )
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        from bs4 import BeautifulSoup
+
+        from unstructured_ingest.utils.html import convert_image_tags
+
         doc_id = file_data.identifier
         try:
-            client = self.connection_config.get_client()
-            page = client.get_page_by_id(
-                page_id=doc_id,
-                expand="history.lastUpdated,version,body.view",
-            )
+            with self.connection_config.get_client() as client:
+                page = client.get_page_by_id(
+                    page_id=doc_id,
+                    expand="history.lastUpdated,version,body.view",
+                )
+
         except Exception as e:
             logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
@@ -182,20 +240,52 @@ class ConfluenceDownloader(Downloader):
             raise ValueError(f"Page with ID {doc_id} does not exist.")

         content = page["body"]["view"]["value"]
+        # This supports v2 html parsing in unstructured
+        title = page["title"]
+        title_html = f"<title>{title}</title>"
+        content = f"<body class='Document' >{title_html}{content}</body>"
+        if self.download_config.extract_images:
+            with self.connection_config.get_client() as client:
+                content = convert_image_tags(
+                    url=file_data.metadata.url, original_html=content, session=client._session
+                )

         filepath = file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_path.parent.mkdir(parents=True, exist_ok=True)
         with open(download_path, "w", encoding="utf8") as f:
-            f.write(content)
+            soup = BeautifulSoup(content, "html.parser")
+            f.write(soup.prettify())

         # Update file_data with metadata
         file_data.metadata.date_created = page["history"]["createdDate"]
         file_data.metadata.date_modified = page["version"]["when"]
         file_data.metadata.version = str(page["version"]["number"])
-        file_data.display_name = page["title"]
+        file_data.display_name = title

-        return self.generate_download_response(file_data=file_data, download_path=download_path)
+        download_response = self.generate_download_response(
+            file_data=file_data, download_path=download_path
+        )
+        if self.download_config.extract_files:
+            with self.connection_config.get_client() as client:
+                extracted_download_responses = self.download_embedded_files(
+                    html=content,
+                    current_file_data=download_response["file_data"],
+                    session=client._session,
+                )
+            if extracted_download_responses:
+                for dr in extracted_download_responses:
+                    fd = dr["file_data"]
+                    source_file_path = Path(file_data.source_identifiers.fullpath).with_suffix(
+                        ""
+                    )
+                    new_fullpath = source_file_path / fd.source_identifiers.filename
+                    fd.source_identifiers = SourceIdentifiers(
+                        fullpath=new_fullpath.as_posix(), filename=new_fullpath.name
+                    )
+                extracted_download_responses.append(download_response)
+                return extracted_download_responses
+        return download_response


 confluence_source_entry = SourceRegistryEntry(
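
When embedded files are extracted, each attachment's FileData is re-rooted under a directory named after the page (the page path minus its extension). The path arithmetic from the hunk above, shown on a hypothetical page path and attachment name:

    from pathlib import Path

    page_fullpath = Path("SPACE/My Page.html")   # hypothetical page fullpath
    attachment_name = "diagram.png"              # hypothetical embedded file

    new_fullpath = page_fullpath.with_suffix("") / attachment_name
    print(new_fullpath.as_posix())               # SPACE/My Page/diagram.png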

unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -4,7 +4,7 @@ from typing import Any

 import pandas as pd

-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id

@@ -96,5 +96,5 @@ class BaseDuckDBUploadStager(UploadStager):
             df[column] = df[column].apply(str)

         data = df.to_dict(orient="records")
-        self.write_output(output_path=output_path, data=data)
+        write_data(path=output_path, data=data)
         return output_path
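
The stager now hands serialization to the shared write_data helper in unstructured_ingest.utils.data_prep rather than the stager's own write_output (the same swap appears in the SQL stager further down, and a new utils/ndjson.py module ships in this release). Only the call signature is visible in this diff, so the following is just a hedged sketch of invoking it with a list of element dictionaries; the path and payload are made up:

    from pathlib import Path

    from unstructured_ingest.utils.data_prep import write_data

    Path("staged").mkdir(exist_ok=True)
    # Hypothetical payload; in the stager this is df.to_dict(orient="records").
    elements = [{"element_id": "abc123", "type": "NarrativeText", "text": "Hello"}]
    write_data(path=Path("staged/example.json"), data=elements)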

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -128,22 +128,22 @@ class AzureIndexer(FsspecIndexer):
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = (
-            str(file_data.get("creation_time").timestamp())
-            if "creation_time" in file_data
+            str(file_info.get("creation_time").timestamp())
+            if "creation_time" in file_info
             else None
         )
         date_modified = (
-            str(file_data.get("last_modified").timestamp())
-            if "last_modified" in file_data
+            str(file_info.get("last_modified").timestamp())
+            if "last_modified" in file_info
             else None
         )

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("etag")
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,

unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -104,22 +104,22 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str := file_data.get("modified_at"):
+        if modified_at_str := file_info.get("modified_at"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str := file_data.get("created_at"):
+        if created_at_str := file_info.get("created_at"):
             date_created = str(parser.parse(created_at_str).timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("id")
+        version = file_info.get("id")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,
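
The file_data to file_info rename running through these fsspec connectors (azure and box above; dropbox, the fsspec base class, gcs, s3 and sftp below) makes it clearer that the argument is a raw listing entry from fsspec, not the pipeline's FileData object. For reference, this is roughly what such an entry looks like; the local filesystem is used as a stand-in here, while cloud backends add extra keys (etag, creation_time, LastModified, and so on) that each connector's get_metadata() maps into FileDataSourceMetadata:

    import fsspec

    fs = fsspec.filesystem("file")
    entries = fs.ls(".", detail=True)
    # Each entry is a plain dict, e.g. {"name": ..., "size": ..., "type": "file"}.
    print(entries[0])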

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -93,15 +93,15 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_path(self, file_data: dict) -> str:
-        return file_data["name"]
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"].lstrip("/")
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"].lstrip("/")
         date_created = None
         date_modified = None
-        server_modified = file_data.get("server_modified")
-        client_modified = file_data.get("client_modified")
+        server_modified = file_info.get("server_modified")
+        client_modified = file_info.get("client_modified")
         if server_modified and client_modified and server_modified > client_modified:
             date_created = str(client_modified.timestamp())
             date_modified = str(server_modified.timestamp())
@@ -109,13 +109,13 @@ class DropboxIndexer(FsspecIndexer):
             date_created = str(server_modified.timestamp())
             date_modified = str(client_modified.timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("content_hash")
+        version = file_info.get("content_hash")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -119,7 +119,7 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise self.wrap_error(e=e)

-    def get_file_data(self) -> list[dict[str, Any]]:
+    def get_file_info(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
@@ -156,24 +156,56 @@

         return random.sample(files, n)

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()

-    def get_path(self, file_data: dict) -> str:
-        return file_data["name"]
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]

     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data)

+    def create_init_file_data(self, remote_filepath: Optional[str] = None) -> FileData:
+        # Create initial file data that requires no network calls and is constructed purely
+        # with information that exists in the config
+        remote_filepath = remote_filepath or self.index_config.remote_url
+        path_without_protocol = remote_filepath.split("://")[1]
+        rel_path = remote_filepath.replace(path_without_protocol, "").lstrip("/")
+        return FileData(
+            identifier=str(uuid5(NAMESPACE_DNS, remote_filepath)),
+            connector_type=self.connector_type,
+            display_name=remote_filepath,
+            source_identifiers=SourceIdentifiers(
+                filename=Path(remote_filepath).name,
+                rel_path=rel_path or None,
+                fullpath=remote_filepath,
+            ),
+            metadata=FileDataSourceMetadata(url=remote_filepath),
+        )
+
+    def hydrate_file_data(self, init_file_data: FileData):
+        # Get file info
+        with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+            files = client.ls(self.index_config.path_without_protocol, detail=True)
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        if not filtered_files:
+            raise ValueError(f"{init_file_data} did not reference any valid file")
+        if len(filtered_files) > 1:
+            raise ValueError(f"{init_file_data} referenced more than one file")
+        file_info = filtered_files[0]
+        init_file_data.additional_metadata = self.get_metadata(file_info=file_info)
+
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.get_file_data()
-        for file_data in files:
-            file_path = self.get_path(file_data=file_data)
+        files = self.get_file_info()
+        for file_info in files:
+            file_path = self.get_path(file_info=file_info)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")

-            additional_metadata = self.sterilize_info(file_data=file_data)
+            additional_metadata = self.sterilize_info(file_data=file_info)
             additional_metadata["original_file_path"] = file_path
             yield FileData(
                 identifier=str(uuid5(NAMESPACE_DNS, file_path)),
@@ -183,7 +215,7 @@ class FsspecIndexer(Indexer):
                     rel_path=rel_path or None,
                     fullpath=file_path,
                 ),
-                metadata=self.get_metadata(file_data=file_data),
+                metadata=self.get_metadata(file_info=file_info),
                 additional_metadata=additional_metadata,
                 display_name=file_path,
             )
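
The two new FsspecIndexer methods split FileData construction into a purely offline step and a listing step. A rough usage sketch, where indexer stands for any already-configured fsspec-based indexer whose remote_url points at a single object (an assumption, not something this diff shows end to end):

    # Built purely from config values -- no network calls are made here.
    file_data = indexer.create_init_file_data()

    # Lists the configured path and attaches that entry's metadata; per the hunk
    # above, this raises if the path matches no file or more than one.
    indexer.hydrate_file_data(init_file_data=file_data)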

unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -131,22 +131,22 @@ class GcsIndexer(FsspecIndexer):
     index_config: GcsIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str := file_data.get("updated"):
+        if modified_at_str := file_info.get("updated"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str := file_data.get("timeCreated"):
+        if created_at_str := file_info.get("timeCreated"):
             date_created = str(parser.parse(created_at_str).timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

-        version = file_data.get("etag")
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -110,22 +110,22 @@ class S3Indexer(FsspecIndexer):
     def wrap_error(self, e: Exception) -> Exception:
         return self.connection_config.wrap_error(e=e)

-    def get_path(self, file_data: dict) -> str:
-        return file_data["Key"]
+    def get_path(self, file_info: dict) -> str:
+        return file_info["Key"]

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["Key"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["Key"]
         date_created = None
         date_modified = None
-        modified = file_data.get("LastModified")
+        modified = file_info.get("LastModified")
         if modified:
             date_created = str(modified.timestamp())
             date_modified = str(modified.timestamp())

-        file_size = file_data.get("size") if "size" in file_data else None
-        file_size = file_size or file_data.get("Size")
+        file_size = file_info.get("size") if "size" in file_info else None
+        file_size = file_size or file_info.get("Size")

-        version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
+        version = file_info.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_info else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -107,12 +107,12 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file

-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
-        date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
-        date_modified = str(file_data.get("mtime").timestamp()) if "mtime" in file_data else None
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
+        date_created = str(file_info.get("time").timestamp()) if "time" in file_info else None
+        date_modified = str(file_info.get("mtime").timestamp()) if "mtime" in file_info else None

-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None

         record_locator = {
             "protocol": self.index_config.protocol,

unstructured_ingest/v2/processes/connectors/sql/__init__.py

@@ -15,11 +15,14 @@ from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
 from .snowflake import snowflake_destination_entry, snowflake_source_entry
 from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
 from .sqlite import sqlite_destination_entry, sqlite_source_entry
+from .vastdb import CONNECTOR_TYPE as VASTDB_CONNECTOR_TYPE
+from .vastdb import vastdb_destination_entry, vastdb_source_entry

 add_source_entry(source_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_source_entry)
 add_source_entry(source_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_source_entry)
 add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_entry)
 add_source_entry(source_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_source_entry)
+add_source_entry(source_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_source_entry)

 add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
 add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)
@@ -31,3 +34,4 @@ add_destination_entry(
     destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
     entry=databricks_delta_tables_destination_entry,
 )
+add_destination_entry(destination_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_destination_entry)

unstructured_ingest/v2/processes/connectors/sql/singlestore.py

@@ -3,6 +3,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

+import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.v2.logger import logger
@@ -139,7 +140,7 @@ class SingleStoreUploader(SQLUploader):
             if isinstance(value, (list, dict)):
                 value = json.dumps(value)
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))
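
The extra pd.isna(value) check matters because missing dates that have passed through a DataFrame arrive as NaN/NaT rather than None, and those are not caught by an is None test. A minimal illustration:

    import pandas as pd

    df = pd.DataFrame({"date_created": pd.to_datetime(["2024-01-01", None])})
    value = df.iloc[1]["date_created"]   # NaT, not None

    print(value is None)    # False
    print(pd.isna(value))   # True -- the new guard maps this to NULL instead of parsing it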

unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -170,7 +170,7 @@ class SnowflakeUploader(SQLUploader):
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df, columns=self.get_table_columns())
+        self._fit_to_schema(df=df)

         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(

unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -14,7 +14,7 @@ from dateutil import parser
 from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
-from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe
+from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dataframe, write_data
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -310,9 +310,11 @@ class SQLUploadStager(UploadStager):
         )
         df = self.conform_dataframe(df=df)

+        output_filename_suffix = Path(elements_filepath).suffix
+        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

-        self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
+        write_data(path=output_path, data=df.to_dict(orient="records"))
         return output_path


@@ -330,6 +332,7 @@ class SQLUploader(Uploader):
     upload_config: SQLUploaderConfig
     connection_config: SQLConnectionConfig
     values_delimiter: str = "?"
+    _columns: list[str] = field(init=False, default=None)

     def precheck(self) -> None:
         try:
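
In the SQLUploadStager hunk above, the staged output now keeps the extension of the incoming elements file rather than whatever extension the caller's output_filename carried, presumably so .json and .ndjson inputs produce matching output formats. The filename handling in isolation, with hypothetical names:

    from pathlib import Path

    elements_filepath = "partitioned/report.pdf.ndjson"   # hypothetical input file
    output_filename = "report.pdf.json"                    # hypothetical requested name

    output_filename_suffix = Path(elements_filepath).suffix
    output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
    print(output_filename)   # report.pdf.ndjson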

@@ -352,7 +355,7 @@
         parsed = []
         for column_name, value in zip(columns, row):
             if column_name in _DATE_COLUMNS:
-                if value is None:
+                if value is None or pd.isna(value):  # pandas is nan
                     parsed.append(None)
                 else:
                     parsed.append(parse_date_string(value))
@@ -362,8 +365,9 @@
         return output

     def _fit_to_schema(self, df: pd.DataFrame) -> pd.DataFrame:
+        table_columns = self.get_table_columns()
         columns = set(df.columns)
-        schema_fields = set(columns)
+        schema_fields = set(table_columns)
         columns_to_drop = columns - schema_fields
         missing_columns = schema_fields - columns

@@ -393,8 +397,8 @@
                 f"record id column "
                 f"{self.upload_config.record_id_key}, skipping delete"
             )
+        df = self._fit_to_schema(df=df)
         df.replace({np.nan: None}, inplace=True)
-        self._fit_to_schema(df=df)

         columns = list(df.columns)
         stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
@@ -422,9 +426,11 @@
                 cursor.executemany(stmt, values)

     def get_table_columns(self) -> list[str]:
-        with self.get_cursor() as cursor:
-            cursor.execute(f"SELECT * from {self.upload_config.table_name}")
-            return [desc[0] for desc in cursor.description]
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = [desc[0] for desc in cursor.description]
+        return self._columns

     def can_delete(self) -> bool:
         return self.upload_config.record_id_key in self.get_table_columns()
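
Two behavioral changes fall out of these last hunks: get_table_columns() now caches the destination table's column list in _columns and probes the table with a LIMIT 1 query instead of a full SELECT, and _fit_to_schema() now compares the DataFrame's columns against the table schema (previously it compared the frame against itself) with its result assigned back before the INSERT is built. The same pattern in a self-contained sketch against an in-memory SQLite table; the table, column and variable names are made up and this is not the library's own code:

    import sqlite3

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE elements (id TEXT, text TEXT, date_created TEXT)")

    _columns = None

    def get_table_columns(cursor) -> list[str]:
        # Cache the column list; probe with LIMIT 1 so large tables stay cheap.
        global _columns
        if _columns is None:
            cursor.execute("SELECT * from elements LIMIT 1")
            _columns = [desc[0] for desc in cursor.description]
        return _columns

    def fit_to_schema(df: pd.DataFrame, table_columns: list[str]) -> pd.DataFrame:
        columns = set(df.columns)
        schema_fields = set(table_columns)
        df = df.drop(columns=list(columns - schema_fields))  # drop columns the table lacks
        for missing in schema_fields - columns:              # add columns the table expects
            df[missing] = None
        return df

    df = pd.DataFrame([{"id": "1", "text": "hello", "unexpected": "x"}])
    df = fit_to_schema(df, get_table_columns(conn.cursor()))
    print(sorted(df.columns))  # ['date_created', 'id', 'text']

Caching matters here because the uploader consults get_table_columns() both in can_delete() and when fitting each staged batch to the schema, so the probe query now runs once per uploader instance instead of on every call.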