unstructured-ingest 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +8 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +6 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +1 -4
- unstructured_ingest/v2/processes/connectors/couchbase.py +204 -22
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +5 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +0 -2
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/METADATA +260 -257
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +14 -12
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.4.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.0.4" # pragma: no cover
+__version__ = "0.0.5" # pragma: no cover

unstructured_ingest/v2/pipeline/pipeline.py
@@ -188,22 +188,26 @@ class Pipeline:
         indices = self.indexer_step.run()
         indices_inputs = [{"file_data_path": i} for i in indices]
         if not indices_inputs:
+            logger.info("No files to process after indexer, exiting")
             return
 
         # Initial filtering on indexed content
         indices_inputs = self.apply_filter(records=indices_inputs)
         if not indices_inputs:
+            logger.info("No files to process after filtering indexed content, exiting")
             return
 
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after downloader, exiting")
             return
 
         # Post download filtering
         downloaded_data = self.apply_filter(records=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after filtering downloaded content, exiting")
             return
 
         # Run uncompress if available
@@ -215,6 +219,7 @@ class Pipeline:
         # Post uncompress filtering
         downloaded_data = self.apply_filter(records=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after filtering uncompressed content, exiting")
             return
 
         if not downloaded_data:
@@ -224,6 +229,7 @@ class Pipeline:
         elements = self.partitioner_step(downloaded_data)
         elements = self.clean_results(results=elements)
         if not elements:
+            logger.info("No files to process after partitioning, exiting")
             return
 
         # Run element specific modifiers
@@ -231,6 +237,7 @@ class Pipeline:
             elements = step(elements) if step else elements
             elements = self.clean_results(results=elements)
             if not elements:
+                logger.info(f"No files to process after {step.__class__.__name__}, exiting")
                 return
 
         # Upload the final result
@@ -333,7 +340,7 @@ class Pipeline:
         )
         if len(destination_entry) != 1:
             raise ValueError(
-                "no entry found in
+                "no entry found in destination registry with matching uploader, "
                 "stager and connection configs"
             )
 
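The added lines all apply the same guard-clause pattern: every pipeline stage that can empty the work queue now logs why it is exiting before returning, while control flow stays the same as in 0.0.4. A minimal, hypothetical sketch of that pattern (the `run_stages` helper and its callable stages are illustrative, not part of the package):

```python
import logging

logger = logging.getLogger(__name__)


def run_stages(records: list[dict], stages: list) -> None:
    """Illustrative only: run callable stages in order, exiting early with a log line."""
    for stage in stages:
        records = stage(records)
        if not records:
            # As in the diff: name the stage that emptied the queue, then bail out.
            logger.info(f"No files to process after {type(stage).__name__}, exiting")
            return
```
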
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -13,13 +13,15 @@ from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
-from .couchbase import couchbase_destination_entry
+from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
 from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
+from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
+from .kdbai import kdbai_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
 from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
@@ -47,6 +49,7 @@ add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_d
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
 
+add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
 add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
 
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
@@ -87,3 +90,5 @@ add_destination_entry(
     destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
     entry=azure_cognitive_search_destination_entry,
 )
+
+add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)

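With this wiring, Couchbase is now registered as both a source and a destination, and the new KDB.AI connector as a destination. Conceptually, the registry calls above just key entries by connector type; a simplified, dict-backed sketch of that idea (the real helpers live in `unstructured_ingest.v2.processes.connector_registry` and take registry-entry objects):

```python
# Simplified sketch; the real add_source_entry/add_destination_entry are richer.
source_registry: dict[str, object] = {}
destination_registry: dict[str, object] = {}


def add_source_entry(source_type: str, entry: object) -> None:
    # A connector type such as "couchbase" maps to its source registry entry.
    source_registry[source_type] = entry


def add_destination_entry(destination_type: str, entry: object) -> None:
    # A connector type such as "kdbai" maps to its destination registry entry.
    destination_registry[destination_type] = entry
```
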
unstructured_ingest/v2/processes/connectors/astradb.py
@@ -31,19 +31,16 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "astradb"
 
 
-@dataclass
 class AstraDBAccessConfig(AccessConfig):
     token: str = Field(description="Astra DB Token with access to the database.")
     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
 
 
-@dataclass
 class AstraDBConnectionConfig(ConnectionConfig):
-    connection_type: str = CONNECTOR_TYPE
+    connection_type: str = Field(default=CONNECTOR_TYPE, init=False)
     access_config: Secret[AstraDBAccessConfig]
 
 
-@dataclass
 class AstraDBUploadStagerConfig(UploadStagerConfig):
     pass
 
unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -1,26 +1,42 @@
+import hashlib
 import json
+import sys
+import time
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator, List
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import
-
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
     UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
     UploadStagerConfig,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
+    SourceRegistryEntry,
 )
 
 if TYPE_CHECKING:
@@ -49,6 +65,19 @@ class CouchbaseConnectionConfig(ConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
     access_config: Secret[CouchbaseAccessConfig]
 
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def connect_to_couchbase(self) -> "Cluster":
+        from couchbase.auth import PasswordAuthenticator
+        from couchbase.cluster import Cluster
+        from couchbase.options import ClusterOptions
+
+        auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
+        options = ClusterOptions(auth)
+        options.apply_profile("wan_development")
+        cluster = Cluster(self.connection_string, options)
+        cluster.wait_until_ready(timedelta(seconds=5))
+        return cluster
+
 
 class CouchbaseUploadStagerConfig(UploadStagerConfig):
     pass
@@ -98,26 +127,9 @@ class CouchbaseUploader(Uploader):
     upload_config: CouchbaseUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["couchbase"], extras="couchbase")
-    def connect_to_couchbase(self) -> "Cluster":
-        from couchbase.auth import PasswordAuthenticator
-        from couchbase.cluster import Cluster
-        from couchbase.options import ClusterOptions
-
-        connection_string = self.connection_config.connection_string
-        username = self.connection_config.username
-        password = self.connection_config.access_config.get_secret_value().password
-
-        auth = PasswordAuthenticator(username, password)
-        options = ClusterOptions(auth)
-        options.apply_profile("wan_development")
-        cluster = Cluster(connection_string, options)
-        cluster.wait_until_ready(timedelta(seconds=5))
-        return cluster
-
     def precheck(self) -> None:
         try:
-            self.connect_to_couchbase()
+            self.connection_config.connect_to_couchbase()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -133,7 +145,7 @@ class CouchbaseUploader(Uploader):
             f"bucket, {self.connection_config.bucket} "
             f"at {self.connection_config.connection_string}",
         )
-        cluster = self.connect_to_couchbase()
+        cluster = self.connection_config.connect_to_couchbase()
         bucket = cluster.bucket(self.connection_config.bucket)
         scope = bucket.scope(self.connection_config.scope)
         collection = scope.collection(self.connection_config.collection)
@@ -142,6 +154,168 @@ class CouchbaseUploader(Uploader):
             collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})
 
 
+class CouchbaseIndexerConfig(IndexerConfig):
+    batch_size: int = Field(default=50, description="Number of documents to index per batch")
+
+
+@dataclass
+class CouchbaseIndexer(Indexer):
+    connection_config: CouchbaseConnectionConfig
+    index_config: CouchbaseIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.connect_to_couchbase()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def _get_doc_ids(self) -> List[str]:
+        query = (
+            f"SELECT META(d).id "
+            f"FROM `{self.connection_config.bucket}`."
+            f"`{self.connection_config.scope}`."
+            f"`{self.connection_config.collection}` as d"
+        )
+
+        max_attempts = 5
+        attempts = 0
+        while attempts < max_attempts:
+            try:
+                cluster = self.connection_config.connect_to_couchbase()
+                result = cluster.query(query)
+                document_ids = [row["id"] for row in result]
+                return document_ids
+            except Exception as e:
+                attempts += 1
+                time.sleep(3)
+                if attempts == max_attempts:
+                    raise SourceConnectionError(f"failed to get document ids: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        ids = self._get_doc_ids()
+
+        id_batches = [
+            ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
+            for i in range(
+                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+            )
+        ]
+        for batch in id_batches:
+            # Make sure the hash is always a positive number to create identified
+            identified = str(hash(tuple(batch)) + sys.maxsize + 1)
+            yield FileData(
+                identifier=identified,
+                connector_type=CONNECTOR_TYPE,
+                metadata=FileDataSourceMetadata(
+                    url=f"{self.connection_config.connection_string}/"
+                    f"{self.connection_config.bucket}",
+                    date_processed=str(time.time()),
+                ),
+                additional_metadata={
+                    "ids": list(batch),
+                    "bucket": self.connection_config.bucket,
+                },
+            )
+
+
+class CouchbaseDownloaderConfig(DownloaderConfig):
+    fields: list[str] = field(default_factory=list)
+
+
+@dataclass
+class CouchbaseDownloader(Downloader):
+    connection_config: CouchbaseConnectionConfig
+    download_config: CouchbaseDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def is_async(self) -> bool:
+        return False
+
+    def get_identifier(self, bucket: str, record_id: str) -> str:
+        f = f"{bucket}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def map_cb_results(self, cb_results: dict) -> str:
+        doc_body = cb_results
+        flattened_dict = flatten_dict(dictionary=doc_body)
+        str_values = [str(value) for value in flattened_dict.values()]
+        concatenated_values = "\n".join(str_values)
+        return concatenated_values
+
+    def generate_download_response(
+        self, result: dict, bucket: str, file_data: FileData
+    ) -> DownloadResponse:
+        record_id = result["id"]
+        filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
+        filename = f"{filename_id}.txt"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from bucket {bucket} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            with open(download_path, "w", encoding="utf8") as f:
+                f.write(self.map_cb_results(cb_results=result))
+        except Exception as e:
+            logger.error(
+                f"failed to download from bucket {bucket} "
+                f"and id {record_id} to {download_path}: {e}",
+                exc_info=True,
+            )
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+        return DownloadResponse(
+            file_data=FileData(
+                identifier=filename_id,
+                connector_type=CONNECTOR_TYPE,
+                metadata=FileDataSourceMetadata(
+                    version=None,
+                    date_processed=str(time.time()),
+                    record_locator={
+                        "connection_string": self.connection_config.connection_string,
+                        "bucket": bucket,
+                        "scope": self.connection_config.scope,
+                        "collection": self.connection_config.collection,
+                        "document_id": record_id,
+                    },
+                ),
+            ),
+            path=download_path,
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        bucket_name: str = file_data.additional_metadata["bucket"]
+        ids: list[str] = file_data.additional_metadata["ids"]
+
+        cluster = self.connection_config.connect_to_couchbase()
+        bucket = cluster.bucket(bucket_name)
+        scope = bucket.scope(self.connection_config.scope)
+        collection = scope.collection(self.connection_config.collection)
+
+        download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+        return list(download_resp)
+
+    def process_doc_id(self, doc_id, collection, bucket_name, file_data):
+        result = collection.get(doc_id)
+        return self.generate_download_response(
+            result=result.content_as[dict], bucket=bucket_name, file_data=file_data
+        )
+
+    def process_all_doc_ids(self, ids, collection, bucket_name, file_data):
+        for doc_id in ids:
+            yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
+
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        raise NotImplementedError()
+
+
 couchbase_destination_entry = DestinationRegistryEntry(
     connection_config=CouchbaseConnectionConfig,
     uploader=CouchbaseUploader,
@@ -149,3 +323,11 @@ couchbase_destination_entry = DestinationRegistryEntry(
     upload_stager=CouchbaseUploadStager,
     upload_stager_config=CouchbaseUploadStagerConfig,
 )
+
+couchbase_source_entry = SourceRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    indexer=CouchbaseIndexer,
+    indexer_config=CouchbaseIndexerConfig,
+    downloader=CouchbaseDownloader,
+    downloader_config=CouchbaseDownloaderConfig,
+)

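The new indexer splits the collection's document IDs into fixed-size batches using ceiling division, then derives each batch's `FileData` identifier from a hash offset by `sys.maxsize + 1` so it can never be negative; the downloader later reads each batch's IDs back from `additional_metadata` and fetches the documents one by one. A self-contained sketch of just the batching arithmetic, with made-up IDs:

```python
import sys

ids = [f"doc-{n}" for n in range(7)]  # made-up document IDs
batch_size = 3

# Ceiling division: 7 IDs with a batch size of 3 yields 3 batches (3, 3, 1).
num_batches = (len(ids) + batch_size - 1) // batch_size
batches = [ids[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]

for batch in batches:
    # hash() is never below -sys.maxsize - 1, so the offset keeps the identifier non-negative.
    identifier = str(hash(tuple(batch)) + sys.maxsize + 1)
    print(identifier, batch)
```
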
unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -0,0 +1,170 @@
+import json
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploadContent,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from kdbai_client import Session, Table
+
+CONNECTOR_TYPE = "kdbai"
+
+
+class KdbaiAccessConfig(AccessConfig):
+    api_key: Optional[str] = Field(
+        default=None,
+        description="A string for the api-key, can be left empty "
+        "when connecting to local KDBAI instance.",
+    )
+
+
+SecretKdbaiAccessConfig = Secret[KdbaiAccessConfig]
+
+
+class KdbaiConnectionConfig(ConnectionConfig):
+    access_config: SecretKdbaiAccessConfig = Field(
+        default=SecretKdbaiAccessConfig(secret_value=KdbaiAccessConfig())
+    )
+    endpoint: str = Field(
+        default="http://localhost:8082", description="Endpoint url where KDBAI is hosted."
+    )
+
+    @requires_dependencies(["kdbai_client"], extras="kdbai")
+    def get_session(self) -> "Session":
+        from kdbai_client import Session
+
+        return Session(
+            api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+        )
+
+
+class KdbaiUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class KdbaiUploadStager(UploadStager):
+    upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        data = []
+        for element in elements_contents:
+            data.append(
+                {
+                    "id": str(uuid.uuid4()),
+                    "element_id": element.get("element_id"),
+                    "document": element.pop("text", None),
+                    "embeddings": element.get("embeddings"),
+                    "metadata": flatten_dict(
+                        dictionary=element.get("metadata"),
+                        flatten_lists=True,
+                        remove_none=True,
+                    ),
+                }
+            )
+        logger.debug(f"writing {len(data)} elements to {output_path}")
+        with output_path.open("w") as output_file:
+            json.dump(data, output_file, indent=2)
+        return output_path
+
+
+class KdbaiUploaderConfig(UploaderConfig):
+    table_name: str = Field(description="The name of the KDBAI table to write into.")
+    batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+@dataclass
+class KdbaiUploader(Uploader):
+    connection_config: KdbaiConnectionConfig
+    upload_config: KdbaiUploaderConfig
+    connector_type: str = field(default=CONNECTOR_TYPE, init=False)
+
+    def precheck(self) -> None:
+        try:
+            self.get_table()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def get_table(self) -> "Table":
+        session: Session = self.connection_config.get_session()
+        table = session.table(self.upload_config.table_name)
+        return table
+
+    def upsert_batch(self, batch: pd.DataFrame):
+        table = self.get_table()
+        table.insert(data=batch)
+
+    def process_dataframe(self, df: pd.DataFrame):
+        logger.debug(
+            f"uploading {len(df)} entries to {self.connection_config.endpoint} "
+            f"db in table {self.upload_config.table_name}"
+        )
+        for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
+            self.upsert_batch(batch=batch_df)
+
+    def process_csv(self, csv_paths: list[Path]):
+        logger.debug(f"uploading content from {len(csv_paths)} csv files")
+        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+        self.process_dataframe(df=df)
+
+    def process_json(self, json_paths: list[Path]):
+        logger.debug(f"uploading content from {len(json_paths)} json files")
+        all_records = []
+        for p in json_paths:
+            with open(p) as json_file:
+                all_records.extend(json.load(json_file))
+
+        df = pd.DataFrame(data=all_records)
+        self.process_dataframe(df=df)
+
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+        csv_paths = [c.path for c in contents if c.path.suffix == ".csv"]
+        if csv_paths:
+            self.process_csv(csv_paths=csv_paths)
+        json_paths = [c.path for c in contents if c.path.suffix == ".json"]
+        if json_paths:
+            self.process_json(json_paths=json_paths)
+
+
+kdbai_destination_entry = DestinationRegistryEntry(
+    connection_config=KdbaiConnectionConfig,
+    uploader=KdbaiUploader,
+    uploader_config=KdbaiUploaderConfig,
+    upload_stager=KdbaiUploadStager,
+    upload_stager_config=KdbaiUploadStagerConfig,
+)

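The new KDB.AI uploader stages each element as a JSON record (generated id, element_id, the element text as `document`, embeddings, and flattened metadata), loads the staged records into a pandas DataFrame, and inserts them in batches by grouping on the row index divided by `batch_size`. A minimal sketch of just that grouping step, with the table insert replaced by a print:

```python
import numpy as np
import pandas as pd

records = [{"id": str(i), "document": f"text {i}"} for i in range(10)]  # stand-in records
df = pd.DataFrame(data=records)
batch_size = 4

# np.arange(len(df)) // batch_size labels rows 0-3 as group 0, 4-7 as group 1, 8-9 as group 2.
for _, batch_df in df.groupby(np.arange(len(df)) // batch_size):
    print(len(batch_df))  # the connector passes each batch to the table insert instead
```
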
unstructured_ingest/v2/processes/connectors/local.py
@@ -71,9 +71,12 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
+        files = []
         if self.index_config.recursive:
-
-
+            files.extend(list(input_path.rglob("*")))
+        else:
+            files.extend(list(input_path.glob("*")))
+        return [f for f in files if f.is_file()]
 
     def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()

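The indexer's directory branch now gathers entries with `rglob("*")` when recursion is enabled (or `glob("*")` otherwise) and keeps only regular files. An equivalent standalone restatement, assuming a `Path` input and a `recursive` flag (the single-file case is simplified here; the connector runs the path through `glob.glob` first):

```python
from pathlib import Path


def list_input_files(input_path: Path, recursive: bool) -> list[Path]:
    """Restates the listing logic from the diff for a quick local check."""
    if input_path.is_file():
        return [input_path]
    files = list(input_path.rglob("*")) if recursive else list(input_path.glob("*"))
    # Drop directories matched by the glob; only regular files get indexed.
    return [f for f in files if f.is_file()]
```
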
unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -42,7 +42,6 @@ SecretPineconeAccessConfig = Secret[PineconeAccessConfig]
 
 class PineconeConnectionConfig(ConnectionConfig):
     index_name: str = Field(description="Name of the index to connect to.")
-    environment: str = Field(description="Environment to connect to.")
     access_config: SecretPineconeAccessConfig = Field(
         default_factory=lambda: SecretPineconeAccessConfig(secret_value=PineconeAccessConfig())
     )
@@ -155,7 +154,6 @@ class PineconeUploader(Uploader):
         logger.info(
             f"writing document batches to destination"
             f" index named {self.connection_config.index_name}"
-            f" environment named {self.connection_config.environment}"
             f" with batch size {self.upload_config.batch_size}"
             f" with {self.upload_config.num_processes} (number of) processes"
         )