unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +102 -91
- test/integration/connectors/sql/test_singlestore.py +111 -99
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +86 -75
- test/integration/connectors/test_astradb.py +22 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +4 -4
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +3 -3
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
- unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +18 -14
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -1,7 +1,5 @@
-import json
 from dataclasses import dataclass, field
 from datetime import date, datetime
-from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Optional

 from dateutil import parser

@@ -42,7 +40,6 @@ class ChromaAccessConfig(AccessConfig):


 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     access_config: Secret[ChromaAccessConfig] = Field(
         default=ChromaAccessConfig(), validate_default=True
     )

@@ -62,6 +59,32 @@ class ChromaConnectionConfig(ConnectionConfig):
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+    @requires_dependencies(["chromadb"], extras="chroma")
+    def get_client(self) -> "Client":
+        import chromadb
+
+        access_config = self.access_config.get_secret_value()
+        if path := self.path:
+            return chromadb.PersistentClient(
+                path=path,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+
+        elif (host := self.host) and (port := self.port):
+            return chromadb.HttpClient(
+                host=host,
+                port=str(port),
+                ssl=self.ssl,
+                headers=access_config.headers,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+        else:
+            raise ValueError("Chroma connector requires either path or host and port to be set.")
+

 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass

@@ -82,11 +105,11 @@ class ChromaUploadStager(UploadStager):
             logger.debug(f"date {date_string} string not a timestamp: {e}")
         return parser.parse(date_string)

-    @staticmethod
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
         Prepares dictionary in the format that Chroma requires
         """
+        data = element_dict.copy()
         return {
             "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "embedding": data.pop("embeddings", None),

@@ -94,26 +117,9 @@ class ChromaUploadStager(UploadStager):
             "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
         }

-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
-        return output_path
-

 class ChromaUploaderConfig(UploaderConfig):
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")


@@ -125,37 +131,11 @@ class ChromaUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.create_client()
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["chromadb"], extras="chroma")
-    def create_client(self) -> "Client":
-        import chromadb
-
-        access_config = self.connection_config.access_config.get_secret_value()
-        if self.connection_config.path:
-            return chromadb.PersistentClient(
-                path=self.connection_config.path,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-
-        elif self.connection_config.host and self.connection_config.port:
-            return chromadb.HttpClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                ssl=self.connection_config.ssl,
-                headers=access_config.headers,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-        else:
-            raise ValueError("Chroma connector requires either path or host and port to be set.")
-
     @DestinationConnectionError.wrap
     def upsert_batch(self, collection, batch):


@@ -189,19 +169,16 @@ class ChromaUploader(Uploader):
         )
         return chroma_dict

-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
-
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(elements_dict)} objects to destination "
-            f"collection {self.connection_config.collection_name} "
+            f"writing {len(data)} objects to destination "
+            f"collection {self.upload_config.collection_name} "
             f"at {self.connection_config.host}",
         )
-        client = self.create_client()
+        client = self.connection_config.get_client()

-        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
-        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+        collection = client.get_or_create_collection(name=self.upload_config.collection_name)
+        for chunk in batch_generator(data, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))
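The Chroma changes move client construction onto ChromaConnectionConfig.get_client(), relocate collection_name from the connection config to ChromaUploaderConfig, switch the stager to a per-element conform_dict(), and replace the uploader's file-based run() with run_data(), which receives the already-staged element dicts. The following is a minimal sketch of how the new surface fits together, not documented API: it assumes the uploader dataclass is built directly from its connection_config and upload_config and that the remaining connection fields (host, port, tenant, database, ssl) carry usable defaults; all values are placeholders.

from unstructured_ingest.v2.processes.connectors.chroma import (
    ChromaConnectionConfig,
    ChromaUploader,
    ChromaUploaderConfig,
)

# Client construction now lives on the connection config: PersistentClient when
# `path` is set, HttpClient when `host` and `port` are set.
connection_config = ChromaConnectionConfig(path="/tmp/chroma-demo")  # placeholder path

# `collection_name` has moved from the connection config to the uploader config.
uploader = ChromaUploader(
    connection_config=connection_config,
    upload_config=ChromaUploaderConfig(collection_name="elements", batch_size=100),
)
uploader.precheck()  # now delegates to connection_config.get_client()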
unstructured_ingest/v2/processes/connectors/couchbase.py

@@ -1,7 +1,7 @@
 import hashlib
-import json
 import sys
 import time
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path

@@ -65,7 +65,8 @@ class CouchbaseConnectionConfig(ConnectionConfig):
     access_config: Secret[CouchbaseAccessConfig]

     @requires_dependencies(["couchbase"], extras="couchbase")
-    def …
+    @contextmanager
+    def get_client(self) -> Generator["Cluster", None, None]:
         from couchbase.auth import PasswordAuthenticator
         from couchbase.cluster import Cluster
         from couchbase.options import ClusterOptions

@@ -73,9 +74,14 @@ class CouchbaseConnectionConfig(ConnectionConfig):
         auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
         options = ClusterOptions(auth)
         options.apply_profile("wan_development")
-        cluster = Cluster(self.connection_string, options)
-        cluster.wait_until_ready(timedelta(seconds=5))
-        return cluster
+        cluster = None
+        try:
+            cluster = Cluster(self.connection_string, options)
+            cluster.wait_until_ready(timedelta(seconds=5))
+            yield cluster
+        finally:
+            if cluster:
+                cluster.close()


 class CouchbaseUploadStagerConfig(UploadStagerConfig):

@@ -88,32 +94,16 @@ class CouchbaseUploadStager(UploadStager):
         default_factory=lambda: CouchbaseUploadStagerConfig()
     )

-    def …
- …
- …
- …
- …
- …
- …
- …
-            elements_contents = json.load(elements_file)
- …
-        output_elements = []
-        for element in elements_contents:
-            new_doc = {
-                element["element_id"]: {
-                    "embedding": element.get("embeddings", None),
-                    "text": element.get("text", None),
-                    "metadata": element.get("metadata", None),
-                    "type": element.get("type", None),
-                }
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            data["element_id"]: {
+                "embedding": data.get("embeddings", None),
+                "text": data.get("text", None),
+                "metadata": data.get("metadata", None),
+                "type": data.get("type", None),
             }
- …
- …
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(output_elements, output_file)
-        return output_path
+        }


 class CouchbaseUploaderConfig(UploaderConfig):

@@ -128,26 +118,26 @@ class CouchbaseUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.connection_config.…
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(elements_dict)} objects to destination "
+            f"writing {len(data)} objects to destination "
             f"bucket, {self.connection_config.bucket} "
             f"at {self.connection_config.connection_string}",
         )
- …
- …
- …
- …
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(self.connection_config.bucket)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)

- …
- …
+            for chunk in batch_generator(data, self.upload_config.batch_size):
+                collection.upsert_multi(
+                    {doc_id: doc for doc in chunk for doc_id, doc in doc.items()}
+                )


 class CouchbaseIndexerConfig(IndexerConfig):

@@ -162,7 +152,7 @@ class CouchbaseIndexer(Indexer):

     def precheck(self) -> None:
         try:
-            self.connection_config.…
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

@@ -180,10 +170,10 @@ class CouchbaseIndexer(Indexer):
         attempts = 0
         while attempts < max_attempts:
             try:
- …
- …
- …
- …
+                with self.connection_config.get_client() as client:
+                    result = client.query(query)
+                    document_ids = [row["id"] for row in result]
+                    return document_ids
             except Exception as e:
                 attempts += 1
                 time.sleep(3)

@@ -294,13 +284,13 @@ class CouchbaseDownloader(Downloader):
         bucket_name: str = file_data.additional_metadata["bucket"]
         ids: list[str] = file_data.additional_metadata["ids"]

- …
- …
- …
- …
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(bucket_name)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)

- …
- …
+            download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+            return list(download_resp)

     def process_doc_id(self, doc_id, collection, bucket_name, file_data):
         result = collection.get(doc_id)
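For Couchbase, the cluster handle is now produced by a @contextmanager-wrapped get_client() on the connection config, so the indexer, downloader and uploader all share one code path and the cluster is closed in the generator's finally clause; the uploader consumes staged dicts through run_data() and writes them with upsert_multi. A minimal sketch of the context-manager pattern follows; the field names come from the diff above, but the constructor call and all values are placeholders and assumptions, not the library's documented API.

from unstructured_ingest.v2.processes.connectors.couchbase import (
    CouchbaseAccessConfig,
    CouchbaseConnectionConfig,
)

# Placeholder connection values; field names follow the diff above.
connection_config = CouchbaseConnectionConfig(
    connection_string="couchbase://localhost",
    username="Administrator",
    access_config=CouchbaseAccessConfig(password="password"),
    bucket="ingest",
    scope="_default",
    collection="_default",
)

# New in 0.3.9: the cluster handle is a context manager, closed on exit
# via cluster.close() in the generator's finally clause.
with connection_config.get_client() as cluster:
    bucket = cluster.bucket(connection_config.bucket)
    collection = bucket.scope(connection_config.scope).collection(connection_config.collection)
    collection.upsert_multi({"example-id": {"text": "hello"}})  # placeholder document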
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -11,6 +11,7 @@ import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
 from unstructured_ingest.v2.interfaces import (

@@ -28,6 +29,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 CONNECTOR_TYPE = "delta_table"


+@requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
     from deltalake.writer import write_deltalake

@@ -136,39 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def process_csv(self, csv_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(csv_paths)} csv files")
-        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
-        return df
-
-    def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(json_paths)} json files")
-        all_records = []
-        for p in json_paths:
-            with open(p) as json_file:
-                all_records.extend(json.load(json_file))
-
-        return pd.DataFrame(data=all_records)
-
-    def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
-        df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
-        return df
-
-    def read_dataframe(self, path: Path) -> pd.DataFrame:
-        if path.suffix == ".csv":
-            return self.process_csv(csv_paths=[path])
-        elif path.suffix == ".json":
-            return self.process_json(json_paths=[path])
-        elif path.suffix == ".parquet":
-            return self.process_parquet(parquet_paths=[path])
-        else:
-            raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
-
-    @requires_dependencies(["deltalake"], extras="delta-table")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
-        df = self.read_dataframe(path)
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )

@@ -203,6 +173,14 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")

+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path)
+        self.upload_dataframe(df=df, file_data=file_data)
+

 delta_table_destination_entry = DestinationRegistryEntry(
     connection_config=DeltaTableConnectionConfig,
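The delta_table uploader drops its private CSV/JSON/Parquet readers in favor of the shared get_data_df() helper added in unstructured_ingest/utils/data_prep.py, and gains a run_data() path that takes staged element dicts directly; both entry points feed upload_dataframe(). The helper's implementation is not shown in this diff, so the following is a hypothetical stand-in that simply mirrors the removed per-format readers; the name get_data_df_sketch and its behavior are assumptions, not the library's actual code.

import json
from pathlib import Path

import pandas as pd


def get_data_df_sketch(path: Path) -> pd.DataFrame:
    # Hypothetical stand-in for unstructured_ingest.utils.data_prep.get_data_df:
    # one shared place that turns a staged file into a DataFrame, replacing the
    # removed process_csv/process_json/process_parquet methods above.
    if path.suffix == ".csv":
        return pd.read_csv(path)
    if path.suffix == ".json":
        with open(path) as json_file:
            return pd.DataFrame(data=json.load(json_file))
    if path.suffix == ".parquet":
        return pd.read_parquet(path)
    raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")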
unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -1,5 +1,3 @@
-import json
-import uuid
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

@@ -7,6 +5,7 @@ from typing import Any
 import pandas as pd

 from unstructured_ingest.v2.interfaces import FileData, UploadStager
+from unstructured_ingest.v2.utils import get_enhanced_element_id

 _COLUMNS = (
     "id",

@@ -56,6 +55,22 @@ _COLUMNS = (
 @dataclass
 class BaseDuckDBUploadStager(UploadStager):

+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        metadata: dict[str, Any] = data.pop("metadata", {})
+        data_source = metadata.pop("data_source", {})
+        coordinates = metadata.pop("coordinates", {})
+
+        data.update(metadata)
+        data.update(data_source)
+        data.update(coordinates)
+
+        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
+
+        # remove extraneous, not supported columns
+        data = {k: v for k, v in data.items() if k in _COLUMNS}
+        return data
+
     def run(
         self,
         elements_filepath: Path,

@@ -64,29 +79,14 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        output = []
-        for data in elements_contents:
-            metadata: dict[str, Any] = data.pop("metadata", {})
-            data_source = metadata.pop("data_source", {})
-            coordinates = metadata.pop("coordinates", {})
-
-            data.update(metadata)
-            data.update(data_source)
-            data.update(coordinates)
-
-            data["id"] = str(uuid.uuid4())
-
-            # remove extraneous, not supported columns
-            data = {k: v for k, v in data.items() if k in _COLUMNS}
-
-            output.append(data)
+        elements_contents = self.get_data(elements_filepath=elements_filepath)
+        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

- …
+        output = [
+            self.conform_dict(element_dict=element_dict, file_data=file_data)
+            for element_dict in elements_contents
+        ]
+        df = pd.DataFrame(data=output)

         for column in filter(
             lambda x: x in df.columns,

@@ -94,6 +94,6 @@ class BaseDuckDBUploadStager(UploadStager):
         ):
             df[column] = df[column].apply(str)

- …
- …
+        data = df.to_dict(orient="records")
+        self.write_output(output_path=output_path, data=data)
         return output_path
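The shared DuckDB stager now conforms elements one at a time through conform_dict(), flattening metadata, data_source and coordinates onto the top-level record, and it replaces the random uuid.uuid4() id with the deterministic get_enhanced_element_id(), so re-staging the same file yields stable row ids. A small sketch of the flattening step on a single element is below; the element is a trimmed placeholder, not real pipeline output, and the key names are illustrative.

# Placeholder element; real elements carry many more metadata fields.
element = {
    "element_id": "abc123",
    "type": "NarrativeText",
    "text": "Hello world",
    "metadata": {
        "filename": "example.pdf",
        "data_source": {"url": "s3://bucket/example.pdf"},
        "coordinates": {"system": "PixelSpace"},
    },
}

data = element.copy()
metadata = data.pop("metadata", {})
data_source = metadata.pop("data_source", {})
coordinates = metadata.pop("coordinates", {})

# The nested dicts are flattened onto the top-level record; conform_dict then
# assigns a deterministic id and drops any key not listed in _COLUMNS.
data.update(metadata)
data.update(data_source)
data.update(coordinates)
print(sorted(data))  # ['element_id', 'filename', 'system', 'text', 'type', 'url']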
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py

@@ -1,11 +1,13 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional

 import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -55,6 +57,20 @@ class DuckDBConnectionConfig(ConnectionConfig):
         "through the `database` argument"
     )

+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["DuckDBConnection", None, None]:
+        import duckdb
+
+        with duckdb.connect(self.database) as client:
+            yield client
+
+    @contextmanager
+    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor
+

 class DuckDBUploadStagerConfig(UploadStagerConfig):
     pass

@@ -79,34 +95,27 @@ class DuckDBUploader(Uploader):

     def precheck(self) -> None:
         try:
- …
- …
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

- …
- …
-        return self._make_duckdb_connection
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_duckdb_connection(self) -> "DuckDBConnection":
-        import duckdb
-
-        return duckdb.connect(self.connection_config.database)
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )

+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- …
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)


 duckdb_destination_entry = DestinationRegistryEntry(
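Connection handling for local DuckDB moves onto DuckDBConnectionConfig as paired get_client()/get_cursor() context managers, so the uploader's precheck and the INSERT path share one code path and connections are released deterministically; the uploader also gains run_data() and reuses get_data_df() for the file-based run(). Below is a minimal sketch of the same nested-contextmanager pattern written outside the connector classes; the database path is a placeholder and the duckdb package must be installed.

from contextlib import contextmanager
from typing import Generator

import duckdb


@contextmanager
def get_client(database: str) -> Generator["duckdb.DuckDBPyConnection", None, None]:
    # duckdb connections support the with-statement and are closed on exit
    with duckdb.connect(database) as client:
        yield client


@contextmanager
def get_cursor(database: str) -> Generator["duckdb.DuckDBPyConnection", None, None]:
    # nesting keeps the cursor's lifetime inside the connection's lifetime
    with get_client(database) as client:
        with client.cursor() as cursor:
            yield cursor


# The uploader's precheck now boils down to:
with get_cursor("demo.duckdb") as cursor:
    cursor.execute("SELECT 1;")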
unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py

@@ -1,12 +1,14 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional

 import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -27,13 +29,12 @@ CONNECTOR_TYPE = "motherduck"


 class MotherDuckAccessConfig(AccessConfig):
-    md_token: Optional[str] = Field(default=None, description="MotherDuck token")
+    md_token: str = Field(default=None, description="MotherDuck token")


 class MotherDuckConnectionConfig(ConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-    database: Optional[str] = Field(
-        default=None,
+    database: str = Field(
         description="Database name. Name of the MotherDuck database.",
     )
     db_schema: Optional[str] = Field(

@@ -48,17 +49,26 @@ class MotherDuckConnectionConfig(ConnectionConfig):
         default=MotherDuckAccessConfig(), validate_default=True
     )

- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
+    @requires_dependencies(["duckdb"], extras="duckdb")
+    @contextmanager
+    def get_client(self) -> Generator["MotherDuckConnection", None, None]:
+        import duckdb
+
+        access_config = self.access_config.get_secret_value()
+        with duckdb.connect(
+            f"md:?motherduck_token={access_config.md_token}",
+            config={
+                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
+            },
+        ) as conn:
+            conn.sql(f"USE {self.database}")
+            yield conn
+
+    @contextmanager
+    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
+        with self.get_client() as client:
+            with client.cursor() as cursor:
+                yield cursor


 class MotherDuckUploadStagerConfig(UploadStagerConfig):

@@ -84,44 +94,27 @@ class MotherDuckUploader(Uploader):

     def precheck(self) -> None:
         try:
- …
- …
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

- …
- …
-        return self._make_motherduck_connection
-
-    @requires_dependencies(["duckdb"], extras="duckdb")
-    def _make_motherduck_connection(self) -> "MotherDuckConnection":
-        import duckdb
+    def upload_dataframe(self, df: pd.DataFrame) -> None:
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

-        access_config = self.connection_config.access_config.get_secret_value()
-        conn = duckdb.connect(
-            f"md:?motherduck_token={access_config.md_token}",
-            config={
-                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
-            },
-        )
-
-        conn.sql(f"USE {self.connection_config.database}")
-
-        return conn
-
-    def upload_contents(self, path: Path) -> None:
-        df_elements = pd.read_json(path, orient="records", lines=True)
-        logger.debug(f"uploading {len(df_elements)} entries to {self.connection_config.database} ")
-
-        with self.connection() as conn:
+        with self.connection_config.get_client() as conn:
             conn.query(
-                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df_elements"  # noqa: E501
+                f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )

+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- …
+        df = get_data_df(path)
+        self.upload_dataframe(df=df)


 motherduck_destination_entry = DestinationRegistryEntry(
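The MotherDuck connector follows the same get_client()/get_cursor() pattern as DuckDB, but it builds the connection string from the md_token, tags requests with a custom user agent derived from the package version, and issues USE on the configured database; the database field no longer defaults to None and md_token is now annotated as str. The sketch below shows only the connection it opens, with placeholder values in place of real credentials (a hardcoded user-agent string stands in for the version variable used in the code).

import duckdb

md_token = "YOUR_MOTHERDUCK_TOKEN"  # placeholder; comes from MotherDuckAccessConfig
database = "my_database"            # placeholder; the config's `database` field

with duckdb.connect(
    f"md:?motherduck_token={md_token}",
    config={"custom_user_agent": "unstructured-io-ingest/0.3.9"},
) as conn:
    conn.sql(f"USE {database}")
    conn.sql("SELECT 1;")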