unstructured-ingest 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in their respective public registries.

Potentially problematic release.



@@ -1 +1 @@
- __version__ = "0.0.4" # pragma: no cover
+ __version__ = "0.0.6" # pragma: no cover
@@ -188,22 +188,26 @@ class Pipeline:
  indices = self.indexer_step.run()
  indices_inputs = [{"file_data_path": i} for i in indices]
  if not indices_inputs:
+ logger.info("No files to process after indexer, exiting")
  return

  # Initial filtering on indexed content
  indices_inputs = self.apply_filter(records=indices_inputs)
  if not indices_inputs:
+ logger.info("No files to process after filtering indexed content, exiting")
  return

  # Download associated content to local file system
  downloaded_data = self.downloader_step(indices_inputs)
  downloaded_data = self.clean_results(results=downloaded_data)
  if not downloaded_data:
+ logger.info("No files to process after downloader, exiting")
  return

  # Post download filtering
  downloaded_data = self.apply_filter(records=downloaded_data)
  if not downloaded_data:
+ logger.info("No files to process after filtering downloaded content, exiting")
  return

  # Run uncompress if available
@@ -215,6 +219,7 @@ class Pipeline:
  # Post uncompress filtering
  downloaded_data = self.apply_filter(records=downloaded_data)
  if not downloaded_data:
+ logger.info("No files to process after filtering uncompressed content, exiting")
  return

  if not downloaded_data:
@@ -224,6 +229,7 @@ class Pipeline:
  elements = self.partitioner_step(downloaded_data)
  elements = self.clean_results(results=elements)
  if not elements:
+ logger.info("No files to process after partitioning, exiting")
  return

  # Run element specific modifiers
@@ -231,6 +237,7 @@ class Pipeline:
  elements = step(elements) if step else elements
  elements = self.clean_results(results=elements)
  if not elements:
+ logger.info(f"No files to process after {step.__class__.__name__}, exiting")
  return

  # Upload the final result
@@ -333,7 +340,7 @@ class Pipeline:
  )
  if len(destination_entry) != 1:
  raise ValueError(
- "no entry found in source registry with matching uploader, "
+ "no entry found in destination registry with matching uploader, "
  "stager and connection configs"
  )

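The common thread in these hunks is that every stage of Pipeline.run now logs why it is exiting early instead of returning silently. A minimal sketch of that guard pattern, using placeholder names rather than the library's own helpers:

import logging

logger = logging.getLogger("unstructured_ingest")

def continue_processing(records: list, stage_name: str) -> bool:
    # Mirrors the guard the release adds after each pipeline step: log which
    # stage produced no work before bailing out, so empty runs are traceable.
    if not records:
        logger.info(f"No files to process after {stage_name}, exiting")
        return False
    return True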
@@ -112,6 +112,7 @@ class Chunker(BaseProcess, ABC):
  @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
  async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
  from unstructured_client import UnstructuredClient
+ from unstructured_client.models.operations import PartitionRequest
  from unstructured_client.models.shared import Files, PartitionParameters

  client = UnstructuredClient(
@@ -137,7 +138,8 @@ class Chunker(BaseProcess, ABC):
  )
  filtered_partition_request["files"] = files
  partition_params = PartitionParameters(**filtered_partition_request)
- resp = client.general.partition(partition_params)
+ partition_request_obj = PartitionRequest(partition_params)
+ resp = client.general.partition(partition_request_obj)
  elements = resp.elements or []
  elements = assign_and_map_hash_ids(elements=elements)
  return elements
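The remote chunking call now wraps PartitionParameters in a PartitionRequest before invoking the client. A minimal standalone sketch of the new call pattern, mirroring the diff above; the server URL, API key, and file path are placeholders:

from unstructured_client import UnstructuredClient
from unstructured_client.models.operations import PartitionRequest
from unstructured_client.models.shared import Files, PartitionParameters

client = UnstructuredClient(
    server_url="https://api.example.com",  # placeholder endpoint
    api_key_auth="MY_API_KEY",             # placeholder key
)

with open("document.pdf", "rb") as f:
    files = Files(content=f.read(), file_name="document.pdf")

partition_params = PartitionParameters(files=files)
# New in 0.0.6: the parameters are wrapped in a PartitionRequest operation object
partition_request = PartitionRequest(partition_params)
resp = client.general.partition(partition_request)
elements = resp.elements or []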
@@ -13,13 +13,15 @@ from .azure_cognitive_search import azure_cognitive_search_destination_entry
  from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
  from .chroma import chroma_destination_entry
  from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
- from .couchbase import couchbase_destination_entry
+ from .couchbase import couchbase_destination_entry, couchbase_source_entry
  from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
  from .databricks_volumes import databricks_volumes_destination_entry
  from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
  from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
  from .google_drive import google_drive_source_entry
+ from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
+ from .kdbai import kdbai_destination_entry
  from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
  from .local import local_destination_entry, local_source_entry
  from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
@@ -47,6 +49,7 @@ add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_d

  add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)

+ add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
  add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)

  add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
@@ -87,3 +90,5 @@ add_destination_entry(
  destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
  entry=azure_cognitive_search_destination_entry,
  )
+
+ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
@@ -31,19 +31,16 @@ if TYPE_CHECKING:
  CONNECTOR_TYPE = "astradb"


- @dataclass
  class AstraDBAccessConfig(AccessConfig):
  token: str = Field(description="Astra DB Token with access to the database.")
  api_endpoint: str = Field(description="The API endpoint for the Astra DB.")


- @dataclass
  class AstraDBConnectionConfig(ConnectionConfig):
- connection_type: str = CONNECTOR_TYPE
+ connection_type: str = Field(default=CONNECTOR_TYPE, init=False)
  access_config: Secret[AstraDBAccessConfig]


- @dataclass
  class AstraDBUploadStagerConfig(UploadStagerConfig):
  pass

@@ -1,26 +1,42 @@
+ import hashlib
  import json
+ import sys
+ import time
  from dataclasses import dataclass, field
  from datetime import timedelta
  from pathlib import Path
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Any, Generator, List

  from pydantic import Field, Secret

- from unstructured_ingest.error import DestinationConnectionError
- from unstructured_ingest.utils.data_prep import batch_generator
+ from unstructured_ingest.error import (
+ DestinationConnectionError,
+ SourceConnectionError,
+ SourceConnectionNetworkError,
+ )
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
  AccessConfig,
  ConnectionConfig,
+ Downloader,
+ DownloaderConfig,
+ DownloadResponse,
+ FileData,
+ FileDataSourceMetadata,
+ Indexer,
+ IndexerConfig,
  UploadContent,
  Uploader,
  UploaderConfig,
  UploadStager,
  UploadStagerConfig,
+ download_responses,
  )
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import (
  DestinationRegistryEntry,
+ SourceRegistryEntry,
  )

  if TYPE_CHECKING:
@@ -49,6 +65,19 @@ class CouchbaseConnectionConfig(ConnectionConfig):
  connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
  access_config: Secret[CouchbaseAccessConfig]

+ @requires_dependencies(["couchbase"], extras="couchbase")
+ def connect_to_couchbase(self) -> "Cluster":
+ from couchbase.auth import PasswordAuthenticator
+ from couchbase.cluster import Cluster
+ from couchbase.options import ClusterOptions
+
+ auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
+ options = ClusterOptions(auth)
+ options.apply_profile("wan_development")
+ cluster = Cluster(self.connection_string, options)
+ cluster.wait_until_ready(timedelta(seconds=5))
+ return cluster
+

  class CouchbaseUploadStagerConfig(UploadStagerConfig):
  pass
@@ -98,26 +127,9 @@ class CouchbaseUploader(Uploader):
  upload_config: CouchbaseUploaderConfig
  connector_type: str = CONNECTOR_TYPE

- @requires_dependencies(["couchbase"], extras="couchbase")
- def connect_to_couchbase(self) -> "Cluster":
- from couchbase.auth import PasswordAuthenticator
- from couchbase.cluster import Cluster
- from couchbase.options import ClusterOptions
-
- connection_string = self.connection_config.connection_string
- username = self.connection_config.username
- password = self.connection_config.access_config.get_secret_value().password
-
- auth = PasswordAuthenticator(username, password)
- options = ClusterOptions(auth)
- options.apply_profile("wan_development")
- cluster = Cluster(connection_string, options)
- cluster.wait_until_ready(timedelta(seconds=5))
- return cluster
-
  def precheck(self) -> None:
  try:
- self.connect_to_couchbase()
+ self.connection_config.connect_to_couchbase()
  except Exception as e:
  logger.error(f"Failed to validate connection {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -133,7 +145,7 @@ class CouchbaseUploader(Uploader):
  f"bucket, {self.connection_config.bucket} "
  f"at {self.connection_config.connection_string}",
  )
- cluster = self.connect_to_couchbase()
+ cluster = self.connection_config.connect_to_couchbase()
  bucket = cluster.bucket(self.connection_config.bucket)
  scope = bucket.scope(self.connection_config.scope)
  collection = scope.collection(self.connection_config.collection)
@@ -142,6 +154,168 @@ class CouchbaseUploader(Uploader):
  collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})


+ class CouchbaseIndexerConfig(IndexerConfig):
+ batch_size: int = Field(default=50, description="Number of documents to index per batch")
+
+
+ @dataclass
+ class CouchbaseIndexer(Indexer):
+ connection_config: CouchbaseConnectionConfig
+ index_config: CouchbaseIndexerConfig
+ connector_type: str = CONNECTOR_TYPE
+
+ def precheck(self) -> None:
+ try:
+ self.connection_config.connect_to_couchbase()
+ except Exception as e:
+ logger.error(f"Failed to validate connection {e}", exc_info=True)
+ raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+ @requires_dependencies(["couchbase"], extras="couchbase")
+ def _get_doc_ids(self) -> List[str]:
+ query = (
+ f"SELECT META(d).id "
+ f"FROM `{self.connection_config.bucket}`."
+ f"`{self.connection_config.scope}`."
+ f"`{self.connection_config.collection}` as d"
+ )
+
+ max_attempts = 5
+ attempts = 0
+ while attempts < max_attempts:
+ try:
+ cluster = self.connection_config.connect_to_couchbase()
+ result = cluster.query(query)
+ document_ids = [row["id"] for row in result]
+ return document_ids
+ except Exception as e:
+ attempts += 1
+ time.sleep(3)
+ if attempts == max_attempts:
+ raise SourceConnectionError(f"failed to get document ids: {e}")
+
+ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+ ids = self._get_doc_ids()
+
+ id_batches = [
+ ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
+ for i in range(
+ (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
+ )
+ ]
+ for batch in id_batches:
+ # Make sure the hash is always a positive number to create identified
+ identified = str(hash(tuple(batch)) + sys.maxsize + 1)
+ yield FileData(
+ identifier=identified,
+ connector_type=CONNECTOR_TYPE,
+ metadata=FileDataSourceMetadata(
+ url=f"{self.connection_config.connection_string}/"
+ f"{self.connection_config.bucket}",
+ date_processed=str(time.time()),
+ ),
+ additional_metadata={
+ "ids": list(batch),
+ "bucket": self.connection_config.bucket,
+ },
+ )
+
+
+ class CouchbaseDownloaderConfig(DownloaderConfig):
+ fields: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class CouchbaseDownloader(Downloader):
+ connection_config: CouchbaseConnectionConfig
+ download_config: CouchbaseDownloaderConfig
+ connector_type: str = CONNECTOR_TYPE
+
+ def is_async(self) -> bool:
+ return False
+
+ def get_identifier(self, bucket: str, record_id: str) -> str:
+ f = f"{bucket}-{record_id}"
+ if self.download_config.fields:
+ f = "{}-{}".format(
+ f,
+ hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+ )
+ return f
+
+ def map_cb_results(self, cb_results: dict) -> str:
+ doc_body = cb_results
+ flattened_dict = flatten_dict(dictionary=doc_body)
+ str_values = [str(value) for value in flattened_dict.values()]
+ concatenated_values = "\n".join(str_values)
+ return concatenated_values
+
+ def generate_download_response(
+ self, result: dict, bucket: str, file_data: FileData
+ ) -> DownloadResponse:
+ record_id = result["id"]
+ filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
+ filename = f"{filename_id}.txt"
+ download_path = self.download_dir / Path(filename)
+ logger.debug(
+ f"Downloading results from bucket {bucket} and id {record_id} to {download_path}"
+ )
+ download_path.parent.mkdir(parents=True, exist_ok=True)
+ try:
+ with open(download_path, "w", encoding="utf8") as f:
+ f.write(self.map_cb_results(cb_results=result))
+ except Exception as e:
+ logger.error(
+ f"failed to download from bucket {bucket} "
+ f"and id {record_id} to {download_path}: {e}",
+ exc_info=True,
+ )
+ raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+ return DownloadResponse(
+ file_data=FileData(
+ identifier=filename_id,
+ connector_type=CONNECTOR_TYPE,
+ metadata=FileDataSourceMetadata(
+ version=None,
+ date_processed=str(time.time()),
+ record_locator={
+ "connection_string": self.connection_config.connection_string,
+ "bucket": bucket,
+ "scope": self.connection_config.scope,
+ "collection": self.connection_config.collection,
+ "document_id": record_id,
+ },
+ ),
+ ),
+ path=download_path,
+ )
+
+ def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+ bucket_name: str = file_data.additional_metadata["bucket"]
+ ids: list[str] = file_data.additional_metadata["ids"]
+
+ cluster = self.connection_config.connect_to_couchbase()
+ bucket = cluster.bucket(bucket_name)
+ scope = bucket.scope(self.connection_config.scope)
+ collection = scope.collection(self.connection_config.collection)
+
+ download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+ return list(download_resp)
+
+ def process_doc_id(self, doc_id, collection, bucket_name, file_data):
+ result = collection.get(doc_id)
+ return self.generate_download_response(
+ result=result.content_as[dict], bucket=bucket_name, file_data=file_data
+ )
+
+ def process_all_doc_ids(self, ids, collection, bucket_name, file_data):
+ for doc_id in ids:
+ yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
+
+ async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+ raise NotImplementedError()
+
+
  couchbase_destination_entry = DestinationRegistryEntry(
  connection_config=CouchbaseConnectionConfig,
  uploader=CouchbaseUploader,
@@ -149,3 +323,11 @@ couchbase_destination_entry = DestinationRegistryEntry(
  upload_stager=CouchbaseUploadStager,
  upload_stager_config=CouchbaseUploadStagerConfig,
  )
+
+ couchbase_source_entry = SourceRegistryEntry(
+ connection_config=CouchbaseConnectionConfig,
+ indexer=CouchbaseIndexer,
+ indexer_config=CouchbaseIndexerConfig,
+ downloader=CouchbaseDownloader,
+ downloader_config=CouchbaseDownloaderConfig,
+ )
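Couchbase can now act as a source as well as a destination: the indexer batches document ids into FileData records and the downloader writes each document's flattened content to a local .txt file. Below is a hedged sketch of driving these classes directly; the module path is inferred from the registry imports, the cluster address, credentials, and bucket names are placeholders, and the CouchbaseAccessConfig field name is assumed from the password attribute used in the code above:

from pydantic import Secret

from unstructured_ingest.v2.processes.connectors.couchbase import (  # path assumed
    CouchbaseAccessConfig,
    CouchbaseConnectionConfig,
    CouchbaseDownloader,
    CouchbaseDownloaderConfig,
    CouchbaseIndexer,
    CouchbaseIndexerConfig,
)

connection_config = CouchbaseConnectionConfig(
    connection_string="couchbase://localhost",  # placeholder cluster
    username="Administrator",                   # placeholder credentials
    bucket="ingest",                            # placeholder bucket/scope/collection
    scope="_default",
    collection="_default",
    access_config=Secret[CouchbaseAccessConfig](
        secret_value=CouchbaseAccessConfig(password="password")  # field name assumed
    ),
)

indexer = CouchbaseIndexer(
    connection_config=connection_config,
    index_config=CouchbaseIndexerConfig(batch_size=50),
)
downloader = CouchbaseDownloader(
    connection_config=connection_config,
    download_config=CouchbaseDownloaderConfig(),  # assumes default download settings
)

for file_data in indexer.run():                  # one FileData per batch of document ids
    for response in downloader.run(file_data=file_data):
        print(response.path)                     # local .txt file per document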
@@ -0,0 +1,170 @@
+ import json
+ import uuid
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Optional
+
+ import numpy as np
+ import pandas as pd
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.utils.data_prep import flatten_dict
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+ AccessConfig,
+ ConnectionConfig,
+ FileData,
+ UploadContent,
+ Uploader,
+ UploaderConfig,
+ UploadStager,
+ UploadStagerConfig,
+ )
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.processes.connector_registry import (
+ DestinationRegistryEntry,
+ )
+
+ if TYPE_CHECKING:
+ from kdbai_client import Session, Table
+
+ CONNECTOR_TYPE = "kdbai"
+
+
+ class KdbaiAccessConfig(AccessConfig):
+ api_key: Optional[str] = Field(
+ default=None,
+ description="A string for the api-key, can be left empty "
+ "when connecting to local KDBAI instance.",
+ )
+
+
+ SecretKdbaiAccessConfig = Secret[KdbaiAccessConfig]
+
+
+ class KdbaiConnectionConfig(ConnectionConfig):
+ access_config: SecretKdbaiAccessConfig = Field(
+ default=SecretKdbaiAccessConfig(secret_value=KdbaiAccessConfig())
+ )
+ endpoint: str = Field(
+ default="http://localhost:8082", description="Endpoint url where KDBAI is hosted."
+ )
+
+ @requires_dependencies(["kdbai_client"], extras="kdbai")
+ def get_session(self) -> "Session":
+ from kdbai_client import Session
+
+ return Session(
+ api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+ )
+
+
+ class KdbaiUploadStagerConfig(UploadStagerConfig):
+ pass
+
+
+ @dataclass
+ class KdbaiUploadStager(UploadStager):
+ upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
+
+ def run(
+ self,
+ elements_filepath: Path,
+ file_data: FileData,
+ output_dir: Path,
+ output_filename: str,
+ **kwargs: Any,
+ ) -> Path:
+ with open(elements_filepath) as elements_file:
+ elements_contents = json.load(elements_file)
+ output_path = Path(output_dir) / Path(f"{output_filename}.json")
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+
+ data = []
+ for element in elements_contents:
+ data.append(
+ {
+ "id": str(uuid.uuid4()),
+ "element_id": element.get("element_id"),
+ "document": element.pop("text", None),
+ "embeddings": element.get("embeddings"),
+ "metadata": flatten_dict(
+ dictionary=element.get("metadata"),
+ flatten_lists=True,
+ remove_none=True,
+ ),
+ }
+ )
+ logger.debug(f"writing {len(data)} elements to {output_path}")
+ with output_path.open("w") as output_file:
+ json.dump(data, output_file, indent=2)
+ return output_path
+
+
+ class KdbaiUploaderConfig(UploaderConfig):
+ table_name: str = Field(description="The name of the KDBAI table to write into.")
+ batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+ @dataclass
+ class KdbaiUploader(Uploader):
+ connection_config: KdbaiConnectionConfig
+ upload_config: KdbaiUploaderConfig
+ connector_type: str = field(default=CONNECTOR_TYPE, init=False)
+
+ def precheck(self) -> None:
+ try:
+ self.get_table()
+ except Exception as e:
+ logger.error(f"Failed to validate connection {e}", exc_info=True)
+ raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+ def get_table(self) -> "Table":
+ session: Session = self.connection_config.get_session()
+ table = session.table(self.upload_config.table_name)
+ return table
+
+ def upsert_batch(self, batch: pd.DataFrame):
+ table = self.get_table()
+ table.insert(data=batch)
+
+ def process_dataframe(self, df: pd.DataFrame):
+ logger.debug(
+ f"uploading {len(df)} entries to {self.connection_config.endpoint} "
+ f"db in table {self.upload_config.table_name}"
+ )
+ for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
+ self.upsert_batch(batch=batch_df)
+
+ def process_csv(self, csv_paths: list[Path]):
+ logger.debug(f"uploading content from {len(csv_paths)} csv files")
+ df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+ self.process_dataframe(df=df)
+
+ def process_json(self, json_paths: list[Path]):
+ logger.debug(f"uploading content from {len(json_paths)} json files")
+ all_records = []
+ for p in json_paths:
+ with open(p) as json_file:
+ all_records.extend(json.load(json_file))
+
+ df = pd.DataFrame(data=all_records)
+ self.process_dataframe(df=df)
+
+ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+ csv_paths = [c.path for c in contents if c.path.suffix == ".csv"]
+ if csv_paths:
+ self.process_csv(csv_paths=csv_paths)
+ json_paths = [c.path for c in contents if c.path.suffix == ".json"]
+ if json_paths:
+ self.process_json(json_paths=json_paths)
+
+
+ kdbai_destination_entry = DestinationRegistryEntry(
+ connection_config=KdbaiConnectionConfig,
+ uploader=KdbaiUploader,
+ uploader_config=KdbaiUploaderConfig,
+ upload_stager=KdbaiUploadStager,
+ upload_stager_config=KdbaiUploadStagerConfig,
+ )
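This new file adds a KDB.AI destination: the stager rewrites partitioned elements into flat records (id, element_id, document, embeddings, flattened metadata) and the uploader inserts them into an existing table via kdbai_client. A minimal sketch of wiring the uploader up by hand; the import path is inferred from the registry imports above, the endpoint and table name are placeholders, and the table is assumed to already exist with a compatible schema:

from unstructured_ingest.v2.processes.connectors.kdbai import (  # path assumed
    KdbaiConnectionConfig,
    KdbaiUploader,
    KdbaiUploaderConfig,
)

uploader = KdbaiUploader(
    connection_config=KdbaiConnectionConfig(endpoint="http://localhost:8082"),
    upload_config=KdbaiUploaderConfig(table_name="unstructured_elements", batch_size=100),
)

# precheck() opens a session and looks up the table, raising
# DestinationConnectionError if either step fails (requires the "kdbai" extra).
uploader.precheck()

From there, uploader.run(contents=...) accepts staged .json or .csv files and inserts them in batches of batch_size rows.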
@@ -71,9 +71,12 @@ class LocalIndexer(Indexer):
  input_path = self.index_config.path
  if input_path.is_file():
  return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
+ files = []
  if self.index_config.recursive:
- return list(input_path.rglob("*"))
- return list(input_path.glob("*"))
+ files.extend(list(input_path.rglob("*")))
+ else:
+ files.extend(list(input_path.glob("*")))
+ return [f for f in files if f.is_file()]

  def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
  stats = path.stat()
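The local indexer previously returned everything glob/rglob yielded, which includes directories; the new code collects the candidates first and keeps only regular files. A small illustration of the difference (the directory path is a placeholder):

from pathlib import Path

input_path = Path("./data")                       # placeholder directory
candidates = list(input_path.rglob("*"))          # yields files *and* subdirectories
files = [f for f in candidates if f.is_file()]    # what 0.0.6 now returns
print(f"{len(candidates)} paths found, {len(files)} are regular files")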
@@ -42,7 +42,6 @@ SecretPineconeAccessConfig = Secret[PineconeAccessConfig]

  class PineconeConnectionConfig(ConnectionConfig):
  index_name: str = Field(description="Name of the index to connect to.")
- environment: str = Field(description="Environment to connect to.")
  access_config: SecretPineconeAccessConfig = Field(
  default_factory=lambda: SecretPineconeAccessConfig(secret_value=PineconeAccessConfig())
  )
@@ -155,7 +154,6 @@ class PineconeUploader(Uploader):
  logger.info(
  f"writing document batches to destination"
  f" index named {self.connection_config.index_name}"
- f" environment named {self.connection_config.environment}"
  f" with batch size {self.upload_config.batch_size}"
  f" with {self.upload_config.num_processes} (number of) processes"
  )
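With the environment field removed, a Pinecone connection config now needs only the index name (credentials still come through access_config, which has a default factory). A one-line sketch; the module path is assumed and the index name is a placeholder:

from unstructured_ingest.v2.processes.connectors.pinecone import PineconeConnectionConfig  # path assumed

config = PineconeConnectionConfig(index_name="my-index")  # no environment argument in 0.0.6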
@@ -13,6 +13,7 @@ from unstructured_ingest.v2.logger import logger

  if TYPE_CHECKING:
  from unstructured_client import UnstructuredClient
+ from unstructured_client.models.operations import PartitionRequest
  from unstructured_client.models.shared import PartitionParameters


@@ -153,7 +154,7 @@ class Partitioner(BaseProcess, ABC):
  )
  return self.postprocess(elements=elements_to_dicts(elements))

- async def call_api(self, client: "UnstructuredClient", request: "PartitionParameters"):
+ async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
  # TODO when client supports async, run without using run_in_executor
  # isolate the IO heavy call
  loop = asyncio.get_event_loop()
@@ -189,6 +190,7 @@ class Partitioner(BaseProcess, ABC):
  self, filename: Path, metadata: Optional[dict] = None, **kwargs
  ) -> list[dict]:
  from unstructured_client import UnstructuredClient
+ from unstructured_client.models.operations import PartitionRequest

  logger.debug(f"partitioning file {filename} with metadata: {metadata}")
  client = UnstructuredClient(
@@ -196,7 +198,8 @@ class Partitioner(BaseProcess, ABC):
  api_key_auth=self.config.api_key.get_secret_value(),
  )
  partition_params = self.create_partition_parameters(filename=filename)
- resp = await self.call_api(client=client, request=partition_params)
+ partition_request = PartitionRequest(partition_params)
+ resp = await self.call_api(client=client, request=partition_request)
  elements = resp.elements or []
  # Append the data source metadata the auto partition does for you
  for element in elements: