unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.

Potentially problematic release.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -1,7 +1,7 @@
  import json
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
- from pathlib import Path
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Any, Generator

  from pydantic import Field, Secret

@@ -49,29 +49,33 @@ class AzureAISearchConnectionConfig(ConnectionConfig):
  access_config: Secret[AzureAISearchAccessConfig]

  @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
- def get_search_client(self) -> "SearchClient":
+ @contextmanager
+ def get_search_client(self) -> Generator["SearchClient", None, None]:
  from azure.core.credentials import AzureKeyCredential
  from azure.search.documents import SearchClient

- return SearchClient(
+ with SearchClient(
  endpoint=self.endpoint,
  index_name=self.index,
  credential=AzureKeyCredential(
  self.access_config.get_secret_value().azure_ai_search_key
  ),
- )
+ ) as client:
+ yield client

  @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
- def get_search_index_client(self) -> "SearchIndexClient":
+ @contextmanager
+ def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
  from azure.core.credentials import AzureKeyCredential
  from azure.search.documents.indexes import SearchIndexClient

- return SearchIndexClient(
+ with SearchIndexClient(
  endpoint=self.endpoint,
  credential=AzureKeyCredential(
  self.access_config.get_secret_value().azure_ai_search_key
  ),
- )
+ ) as search_index_client:
+ yield search_index_client


  class AzureAISearchUploadStagerConfig(UploadStagerConfig):
@@ -92,14 +96,13 @@ class AzureAISearchUploadStager(UploadStager):
  default_factory=lambda: AzureAISearchUploadStagerConfig()
  )

- @staticmethod
- def conform_dict(data: dict, file_data: FileData) -> dict:
+ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
  """
  updates the dictionary that is from each Element being converted into a dict/json
  into a dictionary that conforms to the schema expected by the
  Azure Cognitive Search index
  """
-
+ data = element_dict.copy()
  data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
  data[RECORD_ID_LABEL] = file_data.identifier

@@ -140,31 +143,6 @@ class AzureAISearchUploadStager(UploadStager):
  data["metadata"]["page_number"] = str(page_number)
  return data

- def run(
- self,
- file_data: FileData,
- elements_filepath: Path,
- output_dir: Path,
- output_filename: str,
- **kwargs: Any,
- ) -> Path:
- with open(elements_filepath) as elements_file:
- elements_contents = json.load(elements_file)
-
- conformed_elements = [
- self.conform_dict(data=element, file_data=file_data) for element in elements_contents
- ]
-
- if Path(output_filename).suffix != ".json":
- output_filename = f"{output_filename}.json"
- else:
- output_filename = f"{Path(output_filename).stem}.json"
- output_path = Path(output_dir) / Path(f"{output_filename}.json")
- output_path.parent.mkdir(parents=True, exist_ok=True)
- with open(output_path, "w") as output_file:
- json.dump(conformed_elements, output_file, indent=2)
- return output_path
-

  @dataclass
  class AzureAISearchUploader(Uploader):
@@ -270,9 +248,7 @@ class AzureAISearchUploader(Uploader):
  logger.error(f"failed to validate connection: {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")

- def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- with path.open("r") as file:
- elements_dict = json.load(file)
+ def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
  logger.info(
  f"writing document batches to destination"
  f" endpoint at {str(self.connection_config.endpoint)}"
@@ -287,7 +263,7 @@ class AzureAISearchUploader(Uploader):

  batch_size = self.upload_config.batch_size
  with self.connection_config.get_search_client() as search_client:
- for chunk in batch_generator(elements_dict, batch_size):
+ for chunk in batch_generator(data, batch_size):
  self.write_dict(elements_dict=chunk, search_client=search_client) # noqa: E203

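A pattern that recurs throughout this release is visible above: connection configs expose their clients as context managers, and uploaders consume already-deserialized elements through run_data() instead of re-reading a JSON path in run(). A minimal, self-contained sketch of the context-manager shape; FakeSearchClient and the module-level get_search_client below are illustrative stand-ins, not library code:

    from contextlib import contextmanager
    from typing import Generator

    class FakeSearchClient:
        # Illustrative stand-in for azure.search.documents.SearchClient.
        def __enter__(self):
            return self

        def __exit__(self, *exc):
            self.close()

        def close(self):
            print("client closed")

    @contextmanager
    def get_search_client() -> Generator[FakeSearchClient, None, None]:
        # Same shape as the new connection-config helpers: build the client
        # inside a with-block and yield it, so the caller's with-statement
        # always closes the underlying session, even on error.
        with FakeSearchClient() as client:
            yield client

    with get_search_client() as search_client:
        pass  # write document batches here, as the uploader's run_data() does
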
unstructured_ingest/v2/processes/connectors/chroma.py
@@ -1,7 +1,5 @@
- import json
  from dataclasses import dataclass, field
  from datetime import date, datetime
- from pathlib import Path
  from typing import TYPE_CHECKING, Annotated, Any, Optional

  from dateutil import parser
@@ -42,7 +40,6 @@ class ChromaAccessConfig(AccessConfig):


  class ChromaConnectionConfig(ConnectionConfig):
- collection_name: str = Field(description="The name of the Chroma collection to write into.")
  access_config: Secret[ChromaAccessConfig] = Field(
  default=ChromaAccessConfig(), validate_default=True
  )
@@ -62,6 +59,32 @@ class ChromaConnectionConfig(ConnectionConfig):
  )
  connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+ @requires_dependencies(["chromadb"], extras="chroma")
+ def get_client(self) -> "Client":
+ import chromadb
+
+ access_config = self.access_config.get_secret_value()
+ if path := self.path:
+ return chromadb.PersistentClient(
+ path=path,
+ settings=access_config.settings,
+ tenant=self.tenant,
+ database=self.database,
+ )
+
+ elif (host := self.host) and (port := self.port):
+ return chromadb.HttpClient(
+ host=host,
+ port=str(port),
+ ssl=self.ssl,
+ headers=access_config.headers,
+ settings=access_config.settings,
+ tenant=self.tenant,
+ database=self.database,
+ )
+ else:
+ raise ValueError("Chroma connector requires either path or host and port to be set.")
+

  class ChromaUploadStagerConfig(UploadStagerConfig):
  pass
@@ -82,11 +105,11 @@ class ChromaUploadStager(UploadStager):
  logger.debug(f"date {date_string} string not a timestamp: {e}")
  return parser.parse(date_string)

- @staticmethod
- def conform_dict(data: dict, file_data: FileData) -> dict:
+ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
  """
  Prepares dictionary in the format that Chroma requires
  """
+ data = element_dict.copy()
  return {
  "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
  "embedding": data.pop("embeddings", None),
@@ -94,26 +117,9 @@ class ChromaUploadStager(UploadStager):
  "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
  }

- def run(
- self,
- elements_filepath: Path,
- file_data: FileData,
- output_dir: Path,
- output_filename: str,
- **kwargs: Any,
- ) -> Path:
- with open(elements_filepath) as elements_file:
- elements_contents = json.load(elements_file)
- conformed_elements = [
- self.conform_dict(data=element, file_data=file_data) for element in elements_contents
- ]
- output_path = Path(output_dir) / Path(f"{output_filename}.json")
- with open(output_path, "w") as output_file:
- json.dump(conformed_elements, output_file)
- return output_path
-

  class ChromaUploaderConfig(UploaderConfig):
+ collection_name: str = Field(description="The name of the Chroma collection to write into.")
  batch_size: int = Field(default=100, description="Number of records per batch")


@@ -125,37 +131,11 @@ class ChromaUploader(Uploader):

  def precheck(self) -> None:
  try:
- self.create_client()
+ self.connection_config.get_client()
  except Exception as e:
  logger.error(f"failed to validate connection: {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")

- @requires_dependencies(["chromadb"], extras="chroma")
- def create_client(self) -> "Client":
- import chromadb
-
- access_config = self.connection_config.access_config.get_secret_value()
- if self.connection_config.path:
- return chromadb.PersistentClient(
- path=self.connection_config.path,
- settings=access_config.settings,
- tenant=self.connection_config.tenant,
- database=self.connection_config.database,
- )
-
- elif self.connection_config.host and self.connection_config.port:
- return chromadb.HttpClient(
- host=self.connection_config.host,
- port=self.connection_config.port,
- ssl=self.connection_config.ssl,
- headers=access_config.headers,
- settings=access_config.settings,
- tenant=self.connection_config.tenant,
- database=self.connection_config.database,
- )
- else:
- raise ValueError("Chroma connector requires either path or host and port to be set.")
-
  @DestinationConnectionError.wrap
  def upsert_batch(self, collection, batch):

@@ -189,19 +169,16 @@ class ChromaUploader(Uploader):
  )
  return chroma_dict

- def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- with path.open("r") as file:
- elements_dict = json.load(file)
-
+ def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
  logger.info(
- f"writing {len(elements_dict)} objects to destination "
- f"collection {self.connection_config.collection_name} "
+ f"writing {len(data)} objects to destination "
+ f"collection {self.upload_config.collection_name} "
  f"at {self.connection_config.host}",
  )
- client = self.create_client()
+ client = self.connection_config.get_client()

- collection = client.get_or_create_collection(name=self.connection_config.collection_name)
- for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+ collection = client.get_or_create_collection(name=self.upload_config.collection_name)
+ for chunk in batch_generator(data, self.upload_config.batch_size):
  self.upsert_batch(collection, self.prepare_chroma_list(chunk))

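The Chroma connector now builds its client from the connection config (get_client()) and scopes collection_name to the uploader config. The sketch below mirrors only the dispatch logic of the new get_client(): a persistent local client when a path is configured, an HTTP client when host and port are, otherwise an error. ChromaTarget and describe_client are hypothetical names used for illustration, not library code:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ChromaTarget:
        # Illustrative stand-in for the fields ChromaConnectionConfig dispatches on.
        path: Optional[str] = None
        host: Optional[str] = None
        port: Optional[int] = None

    def describe_client(cfg: ChromaTarget) -> str:
        # Mirrors the walrus-operator dispatch in the new get_client() above.
        if path := cfg.path:
            return f"chromadb.PersistentClient(path={path!r})"
        elif (host := cfg.host) and (port := cfg.port):
            return f"chromadb.HttpClient(host={host!r}, port={str(port)!r})"
        raise ValueError("Chroma connector requires either path or host and port to be set.")

    print(describe_client(ChromaTarget(path="/tmp/chroma")))            # persistent client
    print(describe_client(ChromaTarget(host="localhost", port=8000)))   # HTTP client
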
unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -1,13 +1,12 @@
  import hashlib
- import json
- import sys
  import time
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
  from datetime import timedelta
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, List

- from pydantic import Field, Secret
+ from pydantic import BaseModel, Field, Secret

  from unstructured_ingest.error import (
  DestinationConnectionError,
@@ -18,6 +17,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
  AccessConfig,
+ BatchFileData,
+ BatchItem,
  ConnectionConfig,
  Downloader,
  DownloaderConfig,
@@ -40,11 +41,20 @@ from unstructured_ingest.v2.processes.connector_registry import (

  if TYPE_CHECKING:
  from couchbase.cluster import Cluster
+ from couchbase.collection import Collection

  CONNECTOR_TYPE = "couchbase"
  SERVER_API_VERSION = "1"


+ class CouchbaseAdditionalMetadata(BaseModel):
+ bucket: str
+
+
+ class CouchbaseBatchFileData(BatchFileData):
+ additional_metadata: CouchbaseAdditionalMetadata
+
+
  class CouchbaseAccessConfig(AccessConfig):
  password: str = Field(description="The password for the Couchbase server")

@@ -65,7 +75,8 @@ class CouchbaseConnectionConfig(ConnectionConfig):
  access_config: Secret[CouchbaseAccessConfig]

  @requires_dependencies(["couchbase"], extras="couchbase")
- def connect_to_couchbase(self) -> "Cluster":
+ @contextmanager
+ def get_client(self) -> Generator["Cluster", None, None]:
  from couchbase.auth import PasswordAuthenticator
  from couchbase.cluster import Cluster
  from couchbase.options import ClusterOptions
@@ -73,9 +84,14 @@ class CouchbaseConnectionConfig(ConnectionConfig):
  auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
  options = ClusterOptions(auth)
  options.apply_profile("wan_development")
- cluster = Cluster(self.connection_string, options)
- cluster.wait_until_ready(timedelta(seconds=5))
- return cluster
+ cluster = None
+ try:
+ cluster = Cluster(self.connection_string, options)
+ cluster.wait_until_ready(timedelta(seconds=5))
+ yield cluster
+ finally:
+ if cluster:
+ cluster.close()


  class CouchbaseUploadStagerConfig(UploadStagerConfig):
@@ -88,32 +104,16 @@ class CouchbaseUploadStager(UploadStager):
  default_factory=lambda: CouchbaseUploadStagerConfig()
  )

- def run(
- self,
- elements_filepath: Path,
- output_dir: Path,
- output_filename: str,
- **kwargs: Any,
- ) -> Path:
- with open(elements_filepath) as elements_file:
- elements_contents = json.load(elements_file)
-
- output_elements = []
- for element in elements_contents:
- new_doc = {
- element["element_id"]: {
- "embedding": element.get("embeddings", None),
- "text": element.get("text", None),
- "metadata": element.get("metadata", None),
- "type": element.get("type", None),
- }
+ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+ data = element_dict.copy()
+ return {
+ data["element_id"]: {
+ "embedding": data.get("embeddings", None),
+ "text": data.get("text", None),
+ "metadata": data.get("metadata", None),
+ "type": data.get("type", None),
  }
- output_elements.append(new_doc)
-
- output_path = Path(output_dir) / Path(f"{output_filename}.json")
- with open(output_path, "w") as output_file:
- json.dump(output_elements, output_file)
- return output_path
+ }


  class CouchbaseUploaderConfig(UploaderConfig):
@@ -128,26 +128,26 @@ class CouchbaseUploader(Uploader):

  def precheck(self) -> None:
  try:
- self.connection_config.connect_to_couchbase()
+ self.connection_config.get_client()
  except Exception as e:
  logger.error(f"Failed to validate connection {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")

- def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
- with path.open("r") as file:
- elements_dict = json.load(file)
+ def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
  logger.info(
- f"writing {len(elements_dict)} objects to destination "
+ f"writing {len(data)} objects to destination "
  f"bucket, {self.connection_config.bucket} "
  f"at {self.connection_config.connection_string}",
  )
- cluster = self.connection_config.connect_to_couchbase()
- bucket = cluster.bucket(self.connection_config.bucket)
- scope = bucket.scope(self.connection_config.scope)
- collection = scope.collection(self.connection_config.collection)
+ with self.connection_config.get_client() as client:
+ bucket = client.bucket(self.connection_config.bucket)
+ scope = bucket.scope(self.connection_config.scope)
+ collection = scope.collection(self.connection_config.collection)

- for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
- collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})
+ for chunk in batch_generator(data, self.upload_config.batch_size):
+ collection.upsert_multi(
+ {doc_id: doc for doc in chunk for doc_id, doc in doc.items()}
+ )


  class CouchbaseIndexerConfig(IndexerConfig):
@@ -162,7 +162,7 @@ class CouchbaseIndexer(Indexer):

  def precheck(self) -> None:
  try:
- self.connection_config.connect_to_couchbase()
+ self.connection_config.get_client()
  except Exception as e:
  logger.error(f"Failed to validate connection {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -180,41 +180,31 @@ class CouchbaseIndexer(Indexer):
  attempts = 0
  while attempts < max_attempts:
  try:
- cluster = self.connection_config.connect_to_couchbase()
- result = cluster.query(query)
- document_ids = [row["id"] for row in result]
- return document_ids
+ with self.connection_config.get_client() as client:
+ result = client.query(query)
+ document_ids = [row["id"] for row in result]
+ return document_ids
  except Exception as e:
  attempts += 1
  time.sleep(3)
  if attempts == max_attempts:
  raise SourceConnectionError(f"failed to get document ids: {e}")

- def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+ def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
  ids = self._get_doc_ids()
-
- id_batches = [
- ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
- for i in range(
- (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
- )
- ]
- for batch in id_batches:
+ for batch in batch_generator(ids, self.index_config.batch_size):
  # Make sure the hash is always a positive number to create identified
- identified = str(hash(tuple(batch)) + sys.maxsize + 1)
- yield FileData(
- identifier=identified,
+ yield CouchbaseBatchFileData(
  connector_type=CONNECTOR_TYPE,
- doc_type="batch",
  metadata=FileDataSourceMetadata(
  url=f"{self.connection_config.connection_string}/"
  f"{self.connection_config.bucket}",
  date_processed=str(time.time()),
  ),
- additional_metadata={
- "ids": list(batch),
- "bucket": self.connection_config.bucket,
- },
+ additional_metadata=CouchbaseAdditionalMetadata(
+ bucket=self.connection_config.bucket
+ ),
+ batch_items=[BatchItem(identifier=b) for b in batch],
  )


@@ -251,7 +241,7 @@ class CouchbaseDownloader(Downloader):
  return concatenated_values

  def generate_download_response(
- self, result: dict, bucket: str, file_data: FileData
+ self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
  ) -> DownloadResponse:
  record_id = result[self.download_config.collection_id]
  filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
@@ -271,44 +261,53 @@ class CouchbaseDownloader(Downloader):
  exc_info=True,
  )
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
- return DownloadResponse(
- file_data=FileData(
- identifier=filename_id,
- connector_type=CONNECTOR_TYPE,
- metadata=FileDataSourceMetadata(
- version=None,
- date_processed=str(time.time()),
- record_locator={
- "connection_string": self.connection_config.connection_string,
- "bucket": bucket,
- "scope": self.connection_config.scope,
- "collection": self.connection_config.collection,
- "document_id": record_id,
- },
- ),
- ),
- path=download_path,
+ cast_file_data = FileData.cast(file_data=file_data)
+ cast_file_data.identifier = filename_id
+ cast_file_data.metadata.date_processed = str(time.time())
+ cast_file_data.metadata.record_locator = {
+ "connection_string": self.connection_config.connection_string,
+ "bucket": bucket,
+ "scope": self.connection_config.scope,
+ "collection": self.connection_config.collection,
+ "document_id": record_id,
+ }
+ return super().generate_download_response(
+ file_data=cast_file_data,
+ download_path=download_path,
  )

  def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
- bucket_name: str = file_data.additional_metadata["bucket"]
- ids: list[str] = file_data.additional_metadata["ids"]
+ couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+ bucket_name: str = couchbase_file_data.additional_metadata.bucket
+ ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]

- cluster = self.connection_config.connect_to_couchbase()
- bucket = cluster.bucket(bucket_name)
- scope = bucket.scope(self.connection_config.scope)
- collection = scope.collection(self.connection_config.collection)
+ with self.connection_config.get_client() as client:
+ bucket = client.bucket(bucket_name)
+ scope = bucket.scope(self.connection_config.scope)
+ collection = scope.collection(self.connection_config.collection)

- download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
- return list(download_resp)
+ download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+ return list(download_resp)

- def process_doc_id(self, doc_id, collection, bucket_name, file_data):
+ def process_doc_id(
+ self,
+ doc_id: str,
+ collection: "Collection",
+ bucket_name: str,
+ file_data: CouchbaseBatchFileData,
+ ):
  result = collection.get(doc_id)
  return self.generate_download_response(
  result=result.content_as[dict], bucket=bucket_name, file_data=file_data
  )

- def process_all_doc_ids(self, ids, collection, bucket_name, file_data):
+ def process_all_doc_ids(
+ self,
+ ids: list[str],
+ collection: "Collection",
+ bucket_name: str,
+ file_data: CouchbaseBatchFileData,
+ ):
  for doc_id in ids:
  yield self.process_doc_id(doc_id, collection, bucket_name, file_data)

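The Couchbase indexer drops its hand-rolled slice arithmetic and the sys.maxsize hash identifier in favour of batch_generator from unstructured_ingest.utils.data_prep, wrapping each batch of document ids in BatchItem entries on a typed CouchbaseBatchFileData. The helper below sketches what such a batching generator does; it is an illustrative equivalent, not the library's actual implementation:

    from typing import Any, Generator, Iterable

    def batch_generator(items: Iterable[Any], batch_size: int) -> Generator[list, None, None]:
        # Yield fixed-size lists from an iterable; the final batch may be shorter.
        batch: list = []
        for item in items:
            batch.append(item)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    ids = [f"doc-{i}" for i in range(7)]
    for batch in batch_generator(ids, batch_size=3):
        print(batch)  # ['doc-0', 'doc-1', 'doc-2'], ['doc-3', 'doc-4', 'doc-5'], ['doc-6']
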
unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -11,6 +11,7 @@ import pandas as pd
  from pydantic import Field, Secret

  from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.utils.data_prep import get_data_df
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.utils.table import convert_to_pandas_dataframe
  from unstructured_ingest.v2.interfaces import (
@@ -28,6 +29,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
  CONNECTOR_TYPE = "delta_table"


+ @requires_dependencies(["deltalake"], extras="delta-table")
  def write_deltalake_with_error_handling(queue, **kwargs):
  from deltalake.writer import write_deltalake

@@ -136,39 +138,7 @@ class DeltaTableUploader(Uploader):
  logger.error(f"failed to validate connection: {e}", exc_info=True)
  raise DestinationConnectionError(f"failed to validate connection: {e}")

- def process_csv(self, csv_paths: list[Path]) -> pd.DataFrame:
- logger.debug(f"uploading content from {len(csv_paths)} csv files")
- df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
- return df
-
- def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
- logger.debug(f"uploading content from {len(json_paths)} json files")
- all_records = []
- for p in json_paths:
- with open(p) as json_file:
- all_records.extend(json.load(json_file))
-
- return pd.DataFrame(data=all_records)
-
- def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
- logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
- df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
- return df
-
- def read_dataframe(self, path: Path) -> pd.DataFrame:
- if path.suffix == ".csv":
- return self.process_csv(csv_paths=[path])
- elif path.suffix == ".json":
- return self.process_json(json_paths=[path])
- elif path.suffix == ".parquet":
- return self.process_parquet(parquet_paths=[path])
- else:
- raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
-
- @requires_dependencies(["deltalake"], extras="delta-table")
- def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
- df = self.read_dataframe(path)
+ def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
  updated_upload_path = os.path.join(
  self.connection_config.table_uri, file_data.source_identifiers.relative_path
  )
@@ -203,6 +173,14 @@ class DeltaTableUploader(Uploader):
  logger.error(f"Exception occurred in write_deltalake: {error_message}")
  raise RuntimeError(f"Error in write_deltalake: {error_message}")

+ def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+ df = pd.DataFrame(data=data)
+ self.upload_dataframe(df=df, file_data=file_data)
+
+ def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+ df = get_data_df(path)
+ self.upload_dataframe(df=df, file_data=file_data)
+

  delta_table_destination_entry = DestinationRegistryEntry(
  connection_config=DeltaTableConnectionConfig,
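
The Delta Table uploader now funnels both entry points through upload_dataframe(): run_data() builds a DataFrame directly from in-memory element dicts, while run() loads a staged file via the new get_data_df utility. Based on the read_dataframe logic removed above, that utility plausibly dispatches on file suffix; the helper below is a sketch under that assumption (note the _sketch suffix), not the actual utils.data_prep code:

    from pathlib import Path

    import pandas as pd

    def get_data_df_sketch(path: Path) -> pd.DataFrame:
        # Assumed behaviour: load a staged elements file into a DataFrame by suffix.
        if path.suffix == ".csv":
            return pd.read_csv(path)
        if path.suffix == ".json":
            return pd.read_json(path)  # staged elements are a JSON array of records
        if path.suffix == ".parquet":
            return pd.read_parquet(path)
        raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")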