unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.



Files changed (64)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +102 -91
  10. test/integration/connectors/sql/test_singlestore.py +111 -99
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +86 -75
  13. test/integration/connectors/test_astradb.py +22 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +4 -4
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +3 -3
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  35. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  36. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  37. unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
  38. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  39. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  40. unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
  41. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  42. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  43. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  44. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  45. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
  46. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  47. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  48. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  49. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  50. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  51. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  52. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  53. unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
  54. unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
  55. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  56. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  57. unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
  58. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  59. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +18 -14
  60. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
  61. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
  62. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
  63. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
  64. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -1,7 +1,5 @@
-import json
 from dataclasses import dataclass, field
 from datetime import date, datetime
-from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Optional
 
 from dateutil import parser
@@ -42,7 +40,6 @@ class ChromaAccessConfig(AccessConfig):
 
 
 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     access_config: Secret[ChromaAccessConfig] = Field(
         default=ChromaAccessConfig(), validate_default=True
     )
@@ -62,6 +59,32 @@ class ChromaConnectionConfig(ConnectionConfig):
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @requires_dependencies(["chromadb"], extras="chroma")
+    def get_client(self) -> "Client":
+        import chromadb
+
+        access_config = self.access_config.get_secret_value()
+        if path := self.path:
+            return chromadb.PersistentClient(
+                path=path,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+
+        elif (host := self.host) and (port := self.port):
+            return chromadb.HttpClient(
+                host=host,
+                port=str(port),
+                ssl=self.ssl,
+                headers=access_config.headers,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+        else:
+            raise ValueError("Chroma connector requires either path or host and port to be set.")
+
 
 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass
@@ -82,11 +105,11 @@ class ChromaUploadStager(UploadStager):
             logger.debug(f"date {date_string} string not a timestamp: {e}")
         return parser.parse(date_string)
 
-    @staticmethod
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
        Prepares dictionary in the format that Chroma requires
        """
+        data = element_dict.copy()
         return {
             "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "embedding": data.pop("embeddings", None),
@@ -94,26 +117,9 @@ class ChromaUploadStager(UploadStager):
             "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
         }
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
-        return output_path
-
 
 class ChromaUploaderConfig(UploaderConfig):
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")
 
 
@@ -125,37 +131,11 @@ class ChromaUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            self.create_client()
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @requires_dependencies(["chromadb"], extras="chroma")
-    def create_client(self) -> "Client":
-        import chromadb
-
-        access_config = self.connection_config.access_config.get_secret_value()
-        if self.connection_config.path:
-            return chromadb.PersistentClient(
-                path=self.connection_config.path,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-
-        elif self.connection_config.host and self.connection_config.port:
-            return chromadb.HttpClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                ssl=self.connection_config.ssl,
-                headers=access_config.headers,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-        else:
-            raise ValueError("Chroma connector requires either path or host and port to be set.")
-
     @DestinationConnectionError.wrap
     def upsert_batch(self, collection, batch):
 
@@ -189,19 +169,16 @@ class ChromaUploader(Uploader):
         )
         return chroma_dict
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
-
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(elements_dict)} objects to destination "
-            f"collection {self.connection_config.collection_name} "
+            f"writing {len(data)} objects to destination "
+            f"collection {self.upload_config.collection_name} "
             f"at {self.connection_config.host}",
         )
-        client = self.create_client()
+        client = self.connection_config.get_client()
 
-        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
-        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+        collection = client.get_or_create_collection(name=self.upload_config.collection_name)
+        for chunk in batch_generator(data, self.upload_config.batch_size):
            self.upsert_batch(collection, self.prepare_chroma_list(chunk))
 
 
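Taken together, the chroma.py changes move client construction onto ChromaConnectionConfig.get_client(), relocate collection_name from the connection config to ChromaUploaderConfig, and have the uploader consume staged dicts via run_data(). A minimal wiring sketch under those signatures; the constructor values below are placeholders, not taken from this diff:

    # Hypothetical wiring of the refactored Chroma destination; host, port and
    # collection values are illustrative placeholders.
    from unstructured_ingest.v2.processes.connectors.chroma import (
        ChromaConnectionConfig,
        ChromaUploader,
        ChromaUploaderConfig,
    )

    connection_config = ChromaConnectionConfig(host="localhost", port=8000)
    uploader = ChromaUploader(
        connection_config=connection_config,
        # collection_name now lives on the uploader config rather than the connection config
        upload_config=ChromaUploaderConfig(collection_name="elements", batch_size=100),
    )
    uploader.precheck()  # delegates to connection_config.get_client()
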
unstructured_ingest/v2/processes/connectors/couchbase.py

@@ -1,7 +1,7 @@
 import hashlib
-import json
 import sys
 import time
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path
@@ -65,7 +65,8 @@ class CouchbaseConnectionConfig(ConnectionConfig):
     access_config: Secret[CouchbaseAccessConfig]
 
     @requires_dependencies(["couchbase"], extras="couchbase")
-    def connect_to_couchbase(self) -> "Cluster":
+    @contextmanager
+    def get_client(self) -> Generator["Cluster", None, None]:
         from couchbase.auth import PasswordAuthenticator
         from couchbase.cluster import Cluster
         from couchbase.options import ClusterOptions
@@ -73,9 +74,14 @@ class CouchbaseConnectionConfig(ConnectionConfig):
         auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
         options = ClusterOptions(auth)
         options.apply_profile("wan_development")
-        cluster = Cluster(self.connection_string, options)
-        cluster.wait_until_ready(timedelta(seconds=5))
-        return cluster
+        cluster = None
+        try:
+            cluster = Cluster(self.connection_string, options)
+            cluster.wait_until_ready(timedelta(seconds=5))
+            yield cluster
+        finally:
+            if cluster:
+                cluster.close()
 
 
 class CouchbaseUploadStagerConfig(UploadStagerConfig):
@@ -88,32 +94,16 @@ class CouchbaseUploadStager(UploadStager):
         default_factory=lambda: CouchbaseUploadStagerConfig()
     )
 
-    def run(
-        self,
-        elements_filepath: Path,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        output_elements = []
-        for element in elements_contents:
-            new_doc = {
-                element["element_id"]: {
-                    "embedding": element.get("embeddings", None),
-                    "text": element.get("text", None),
-                    "metadata": element.get("metadata", None),
-                    "type": element.get("type", None),
-                }
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            data["element_id"]: {
+                "embedding": data.get("embeddings", None),
+                "text": data.get("text", None),
+                "metadata": data.get("metadata", None),
+                "type": data.get("type", None),
             }
-            output_elements.append(new_doc)
-
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(output_elements, output_file)
-        return output_path
+        }
 
 
 class CouchbaseUploaderConfig(UploaderConfig):
@@ -128,26 +118,26 @@ class CouchbaseUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            self.connection_config.connect_to_couchbase()
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(elements_dict)} objects to destination "
+            f"writing {len(data)} objects to destination "
             f"bucket, {self.connection_config.bucket} "
             f"at {self.connection_config.connection_string}",
         )
-        cluster = self.connection_config.connect_to_couchbase()
-        bucket = cluster.bucket(self.connection_config.bucket)
-        scope = bucket.scope(self.connection_config.scope)
-        collection = scope.collection(self.connection_config.collection)
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(self.connection_config.bucket)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)
 
-        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
-            collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})
+            for chunk in batch_generator(data, self.upload_config.batch_size):
+                collection.upsert_multi(
+                    {doc_id: doc for doc in chunk for doc_id, doc in doc.items()}
+                )
 
 
 class CouchbaseIndexerConfig(IndexerConfig):
@@ -162,7 +152,7 @@ class CouchbaseIndexer(Indexer):
 
     def precheck(self) -> None:
         try:
-            self.connection_config.connect_to_couchbase()
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -180,10 +170,10 @@ class CouchbaseIndexer(Indexer):
         attempts = 0
         while attempts < max_attempts:
             try:
-                cluster = self.connection_config.connect_to_couchbase()
-                result = cluster.query(query)
-                document_ids = [row["id"] for row in result]
-                return document_ids
+                with self.connection_config.get_client() as client:
+                    result = client.query(query)
+                    document_ids = [row["id"] for row in result]
+                    return document_ids
             except Exception as e:
                 attempts += 1
                 time.sleep(3)
@@ -294,13 +284,13 @@ class CouchbaseDownloader(Downloader):
         bucket_name: str = file_data.additional_metadata["bucket"]
         ids: list[str] = file_data.additional_metadata["ids"]
 
-        cluster = self.connection_config.connect_to_couchbase()
-        bucket = cluster.bucket(bucket_name)
-        scope = bucket.scope(self.connection_config.scope)
-        collection = scope.collection(self.connection_config.collection)
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(bucket_name)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)
 
-        download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
-        return list(download_resp)
+            download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+            return list(download_resp)
 
     def process_doc_id(self, doc_id, collection, bucket_name, file_data):
         result = collection.get(doc_id)
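Across the couchbase.py changes, connect_to_couchbase() becomes a context-managed get_client() that closes the Cluster when the block exits; the uploader, indexer and downloader above all consume it the same way. A minimal usage sketch with placeholder connection values:

    # Placeholder credentials; mirrors how the uploader above consumes the client.
    from unstructured_ingest.v2.processes.connectors.couchbase import (
        CouchbaseAccessConfig,
        CouchbaseConnectionConfig,
    )

    config = CouchbaseConnectionConfig(
        connection_string="couchbase://localhost",
        username="Administrator",
        bucket="ingest",
        scope="_default",
        collection="_default",
        access_config=CouchbaseAccessConfig(password="password"),
    )

    with config.get_client() as cluster:
        # the Cluster is closed automatically when this block exits, even on error
        bucket = cluster.bucket(config.bucket)
        collection = bucket.scope(config.scope).collection(config.collection)
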
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -11,6 +11,7 @@ import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
 from unstructured_ingest.v2.interfaces import (
@@ -28,6 +29,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 CONNECTOR_TYPE = "delta_table"
 
 
+@requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
     from deltalake.writer import write_deltalake
 
@@ -136,39 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def process_csv(self, csv_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(csv_paths)} csv files")
-        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
-        return df
-
-    def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(json_paths)} json files")
-        all_records = []
-        for p in json_paths:
-            with open(p) as json_file:
-                all_records.extend(json.load(json_file))
-
-        return pd.DataFrame(data=all_records)
-
-    def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
-        df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
-        return df
-
-    def read_dataframe(self, path: Path) -> pd.DataFrame:
-        if path.suffix == ".csv":
-            return self.process_csv(csv_paths=[path])
-        elif path.suffix == ".json":
-            return self.process_json(json_paths=[path])
-        elif path.suffix == ".parquet":
-            return self.process_parquet(parquet_paths=[path])
-        else:
-            raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
-
-    @requires_dependencies(["deltalake"], extras="delta-table")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
-        df = self.read_dataframe(path)
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )
@@ -203,6 +173,14 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path)
+        self.upload_dataframe(df=df, file_data=file_data)
+
 
 delta_table_destination_entry = DestinationRegistryEntry(
     connection_config=DeltaTableConnectionConfig,
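The removed per-format readers are superseded by the shared get_data_df helper from unstructured_ingest/utils/data_prep.py (that module gains 36 lines in this release but is not part of this diff), while run_data skips files entirely and builds the DataFrame from the staged dicts. A plausible shape for the helper, offered only as an assumption based on the readers it replaces:

    # Assumed shape of get_data_df; the real implementation lives in
    # unstructured_ingest/utils/data_prep.py and is not shown in this diff.
    from pathlib import Path

    import pandas as pd


    def get_data_df(path: Path) -> pd.DataFrame:
        if path.suffix == ".csv":
            return pd.read_csv(path)
        if path.suffix == ".json":
            return pd.read_json(path, orient="records")
        if path.suffix == ".parquet":
            return pd.read_parquet(path)
        raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
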
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -1,5 +1,3 @@
-import json
-import uuid
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@@ -7,6 +5,7 @@ from typing import Any
 import pandas as pd
 
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 _COLUMNS = (
     "id",
@@ -56,6 +55,22 @@ _COLUMNS = (
 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
 
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        metadata: dict[str, Any] = data.pop("metadata", {})
+        data_source = metadata.pop("data_source", {})
+        coordinates = metadata.pop("coordinates", {})
+
+        data.update(metadata)
+        data.update(data_source)
+        data.update(coordinates)
+
+        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
+
+        # remove extraneous, not supported columns
+        data = {k: v for k, v in data.items() if k in _COLUMNS}
+        return data
+
     def run(
         self,
         elements_filepath: Path,
@@ -64,29 +79,14 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict] = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
-            data_source = metadata.pop("data_source", {})
-            coordinates = metadata.pop("coordinates", {})
-
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
-
-            data["id"] = str(uuid.uuid4())
-
-            # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
+        elements_contents = self.get_data(elements_filepath=elements_filepath)
+        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-        df = pd.DataFrame.from_dict(output)
+        output = [
+            self.conform_dict(element_dict=element_dict, file_data=file_data)
+            for element_dict in elements_contents
+        ]
+        df = pd.DataFrame(data=output)
 
         for column in filter(
             lambda x: x in df.columns,
@@ -94,6 +94,6 @@ class BaseDuckDBUploadStager(UploadStager):
         ):
             df[column] = df[column].apply(str)
 
-        with output_path.open("w") as output_file:
-            df.to_json(output_file, orient="records", lines=True)
+        data = df.to_dict(orient="records")
+        self.write_output(output_path=output_path, data=data)
         return output_path
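The inline transform from the old run() loop is now an overridable conform_dict, and the row id switches from a throwaway uuid4 to the deterministic get_enhanced_element_id, so restaging the same element yields the same id. A standalone sketch of the flattening on an illustrative element:

    # Illustrative element; reproduces the flattening step performed by conform_dict.
    from typing import Any

    element: dict[str, Any] = {
        "element_id": "abc123",
        "text": "Hello world",
        "type": "NarrativeText",
        "metadata": {
            "filename": "report.pdf",
            "data_source": {"url": "s3://bucket/report.pdf"},
            "coordinates": {"points": [[0, 0], [1, 1]]},
        },
    }

    data = element.copy()
    metadata = data.pop("metadata", {})
    data_source = metadata.pop("data_source", {})
    coordinates = metadata.pop("coordinates", {})

    data.update(metadata)
    data.update(data_source)
    data.update(coordinates)
    # data now holds element_id, text, type, filename, url and points at the top level;
    # the real method then sets a deterministic "id" via get_enhanced_element_id and
    # keeps only the keys present in _COLUMNS.
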
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py

@@ -1,11 +1,13 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -55,6 +57,20 @@ class DuckDBConnectionConfig(ConnectionConfig):
                 "through the `database` argument"
             )
 
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["DuckDBConnection", None, None]:
+        import duckdb
+
+        with duckdb.connect(self.database) as client:
+            yield client
+
+    @contextmanager
+    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
+
 
 class DuckDBUploadStagerConfig(UploadStagerConfig):
     pass
@@ -79,34 +95,27 @@ class DuckDBUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            cursor = self.connection().cursor()
-            cursor.execute("SELECT 1;")
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @property
-    def connection(self) -> Callable[[], "DuckDBConnection"]:
-        return self._make_duckdb_connection
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_duckdb_connection(self) -> "DuckDBConnection":
-        import duckdb
-
-        return duckdb.connect(self.connection_config.database)
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements" # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df" # noqa: E501
             )
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        self.upload_contents(path=path)
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)
 
 
 duckdb_destination_entry = DestinationRegistryEntry(
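Connection handling now lives on DuckDBConnectionConfig as context managers, so callers get a scoped client or cursor without managing cleanup; the precheck above is the first consumer. A usage sketch, with a placeholder database path and assuming the remaining config fields keep their defaults:

    # Placeholder database path; replicates the "SELECT 1" health check from precheck.
    from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import DuckDBConnectionConfig

    config = DuckDBConnectionConfig(database="/tmp/elements.duckdb")
    with config.get_cursor() as cursor:
        cursor.execute("SELECT 1;")
        print(cursor.fetchone())  # -> (1,)
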
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py

@@ -1,12 +1,14 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -27,13 +29,12 @@ CONNECTOR_TYPE = "motherduck"
 
 
 class MotherDuckAccessConfig(AccessConfig):
-    md_token: Optional[str] = Field(default=None, description="MotherDuck token")
+    md_token: str = Field(default=None, description="MotherDuck token")
 
 
 class MotherDuckConnectionConfig(ConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-    database: Optional[str] = Field(
-        default=None,
+    database: str = Field(
         description="Database name. Name of the MotherDuck database.",
     )
     db_schema: Optional[str] = Field(
@@ -48,17 +49,26 @@ class MotherDuckConnectionConfig(ConnectionConfig):
         default=MotherDuckAccessConfig(), validate_default=True
     )
 
-    def __post_init__(self):
-        if self.database is None:
-            raise ValueError(
-                "A MotherDuck connection requires a database (string) to be passed "
-                "through the `database` argument"
-            )
-        if self.access_config.md_token is None:
-            raise ValueError(
-                "A MotherDuck connection requires a md_token (MotherDuck token) to be passed "
-                "using MotherDuckAccessConfig through the `access_config` argument"
-            )
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["MotherDuckConnection", None, None]:
+        import duckdb
+
+        access_config = self.access_config.get_secret_value()
+        with duckdb.connect(
+            f"md:?motherduck_token={access_config.md_token}",
+            config={
+                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
+            },
+        ) as conn:
+            conn.sql(f"USE {self.database}")
+            yield conn
+
+    @contextmanager
+    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
 
 
 class MotherDuckUploadStagerConfig(UploadStagerConfig):
@@ -84,44 +94,27 @@ class MotherDuckUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            cursor = self.connection().cursor()
-            cursor.execute("SELECT 1;")
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @property
-    def connection(self) -> Callable[[], "MotherDuckConnection"]:
-        return self._make_motherduck_connection
-
-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_motherduck_connection(self) -> "MotherDuckConnection":
-        import duckdb
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
 
-        access_config = self.connection_config.access_config.get_secret_value()
-        conn = duckdb.connect(
-            f"md:?motherduck_token={access_config.md_token}",
-            config={
-                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
-            },
-        )
-
-        conn.sql(f"USE {self.connection_config.database}")
-
-        return conn
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements" # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df" # noqa: E501
            )
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        self.upload_contents(path=path)
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)
 
 
 motherduck_destination_entry = DestinationRegistryEntry(
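In motherduck.py, database becomes a required field and md_token drops its Optional annotation, replacing the old __post_init__ checks, while connections go through the same context-managed get_client()/get_cursor() pair as the local DuckDB connector. A configuration sketch with a placeholder database name and the token read from the environment:

    # Placeholder database name; the MotherDuck token is expected in the environment.
    import os

    from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
        MotherDuckAccessConfig,
        MotherDuckConnectionConfig,
    )

    config = MotherDuckConnectionConfig(
        database="my_db",
        access_config=MotherDuckAccessConfig(md_token=os.environ["MOTHERDUCK_TOKEN"]),
    )
    with config.get_cursor() as cursor:
        cursor.execute("SELECT 1;")
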