unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/azure_ai_search.py

@@ -1,7 +1,7 @@
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generator

 from pydantic import Field, Secret

@@ -49,29 +49,33 @@ class AzureAISearchConnectionConfig(ConnectionConfig):
     access_config: Secret[AzureAISearchAccessConfig]

     @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-
+    @contextmanager
+    def get_search_client(self) -> Generator["SearchClient", None, None]:
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient

-
+        with SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
                 self.access_config.get_secret_value().azure_ai_search_key
             ),
-        )
+        ) as client:
+            yield client

     @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
-
+    @contextmanager
+    def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents.indexes import SearchIndexClient

-
+        with SearchIndexClient(
             endpoint=self.endpoint,
             credential=AzureKeyCredential(
                 self.access_config.get_secret_value().azure_ai_search_key
             ),
-        )
+        ) as search_index_client:
+            yield search_index_client


 class AzureAISearchUploadStagerConfig(UploadStagerConfig):
@@ -92,14 +96,13 @@ class AzureAISearchUploadStager(UploadStager):
         default_factory=lambda: AzureAISearchUploadStagerConfig()
     )

-
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
         updates the dictionary that is from each Element being converted into a dict/json
         into a dictionary that conforms to the schema expected by the
         Azure Cognitive Search index
         """
-
+        data = element_dict.copy()
         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
         data[RECORD_ID_LABEL] = file_data.identifier

@@ -140,31 +143,6 @@ class AzureAISearchUploadStager(UploadStager):
             data["metadata"]["page_number"] = str(page_number)
         return data

-    def run(
-        self,
-        file_data: FileData,
-        elements_filepath: Path,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-
-        if Path(output_filename).suffix != ".json":
-            output_filename = f"{output_filename}.json"
-        else:
-            output_filename = f"{Path(output_filename).stem}.json"
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file, indent=2)
-        return output_path
-

 @dataclass
 class AzureAISearchUploader(Uploader):
@@ -270,9 +248,7 @@ class AzureAISearchUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
             f"writing document batches to destination"
             f" endpoint at {str(self.connection_config.endpoint)}"
@@ -287,7 +263,7 @@ class AzureAISearchUploader(Uploader):

         batch_size = self.upload_config.batch_size
         with self.connection_config.get_search_client() as search_client:
-            for chunk in batch_generator(
+            for chunk in batch_generator(data, batch_size):
                 self.write_dict(elements_dict=chunk, search_client=search_client)  # noqa: E203

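The hunks above replace the plain client factory methods with `@contextmanager` generators, so callers acquire and release the Azure SDK clients through a `with` block. A minimal sketch of the same idiom, using a stand-in client class instead of the real `azure.search.documents.SearchClient` (all names here are illustrative, not the connector's actual API):

```python
from contextlib import contextmanager
from typing import Generator


class FakeSearchClient:
    """Stand-in for an SDK client that is itself a context manager."""

    def __enter__(self) -> "FakeSearchClient":
        return self

    def __exit__(self, *exc) -> None:
        print("client closed")

    def upload_documents(self, documents: list[dict]) -> None:
        print(f"uploaded {len(documents)} documents")


@contextmanager
def get_search_client() -> Generator[FakeSearchClient, None, None]:
    # Mirrors the pattern in the diff: enter the SDK client here so it is
    # closed automatically when the caller's `with` block exits.
    with FakeSearchClient() as client:
        yield client


if __name__ == "__main__":
    with get_search_client() as search_client:
        search_client.upload_documents([{"id": "1", "text": "hello"}])
```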
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -1,7 +1,5 @@
-import json
 from dataclasses import dataclass, field
 from datetime import date, datetime
-from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Optional

 from dateutil import parser
@@ -42,7 +40,6 @@ class ChromaAccessConfig(AccessConfig):


 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     access_config: Secret[ChromaAccessConfig] = Field(
         default=ChromaAccessConfig(), validate_default=True
     )
@@ -62,6 +59,32 @@ class ChromaConnectionConfig(ConnectionConfig):
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

+    @requires_dependencies(["chromadb"], extras="chroma")
+    def get_client(self) -> "Client":
+        import chromadb
+
+        access_config = self.access_config.get_secret_value()
+        if path := self.path:
+            return chromadb.PersistentClient(
+                path=path,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+
+        elif (host := self.host) and (port := self.port):
+            return chromadb.HttpClient(
+                host=host,
+                port=str(port),
+                ssl=self.ssl,
+                headers=access_config.headers,
+                settings=access_config.settings,
+                tenant=self.tenant,
+                database=self.database,
+            )
+        else:
+            raise ValueError("Chroma connector requires either path or host and port to be set.")
+

 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass
@@ -82,11 +105,11 @@ class ChromaUploadStager(UploadStager):
             logger.debug(f"date {date_string} string not a timestamp: {e}")
             return parser.parse(date_string)

-
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
         Prepares dictionary in the format that Chroma requires
         """
+        data = element_dict.copy()
         return {
             "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "embedding": data.pop("embeddings", None),
@@ -94,26 +117,9 @@ class ChromaUploadStager(UploadStager):
             "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True),
         }

-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
-        return output_path
-

 class ChromaUploaderConfig(UploaderConfig):
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")


@@ -125,37 +131,11 @@ class ChromaUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["chromadb"], extras="chroma")
-    def create_client(self) -> "Client":
-        import chromadb
-
-        access_config = self.connection_config.access_config.get_secret_value()
-        if self.connection_config.path:
-            return chromadb.PersistentClient(
-                path=self.connection_config.path,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-
-        elif self.connection_config.host and self.connection_config.port:
-            return chromadb.HttpClient(
-                host=self.connection_config.host,
-                port=self.connection_config.port,
-                ssl=self.connection_config.ssl,
-                headers=access_config.headers,
-                settings=access_config.settings,
-                tenant=self.connection_config.tenant,
-                database=self.connection_config.database,
-            )
-        else:
-            raise ValueError("Chroma connector requires either path or host and port to be set.")
-
     @DestinationConnectionError.wrap
     def upsert_batch(self, collection, batch):

@@ -189,19 +169,16 @@ class ChromaUploader(Uploader):
         )
         return chroma_dict

-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
-
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(
-            f"collection {self.
+            f"writing {len(data)} objects to destination "
+            f"collection {self.upload_config.collection_name} "
             f"at {self.connection_config.host}",
         )
-        client = self.
+        client = self.connection_config.get_client()

-        collection = client.get_or_create_collection(name=self.
-        for chunk in batch_generator(
+        collection = client.get_or_create_collection(name=self.upload_config.collection_name)
+        for chunk in batch_generator(data, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))

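As in the Azure connector, the Chroma stager drops its bespoke `run()` file handling and keeps only the per-element `conform_dict` transform, leaving serialization to the shared `UploadStager` base class. A simplified sketch of that division of labor (the stand-in base class below approximates, and does not reproduce, the real `upload_stager.py` interface):

```python
import json
from pathlib import Path
from typing import Any


class SimpleUploadStager:
    """Simplified stand-in for the shared UploadStager base class."""

    def conform_dict(self, element_dict: dict, file_data: Any) -> dict:
        raise NotImplementedError

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        file_data: Any = None,
        **kwargs: Any,
    ) -> Path:
        # Shared file handling: read staged elements, conform each one, write JSON back out.
        with open(elements_filepath) as f:
            elements = json.load(f)
        conformed = [self.conform_dict(e, file_data) for e in elements]
        output_path = Path(output_dir) / f"{Path(output_filename).stem}.json"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            json.dump(conformed, f, indent=2)
        return output_path


class ChromaLikeStager(SimpleUploadStager):
    # Connector-specific work is now only the per-element transform.
    def conform_dict(self, element_dict: dict, file_data: Any) -> dict:
        data = element_dict.copy()
        return {"id": data.get("element_id"), "document": data.get("text")}


if __name__ == "__main__":
    import tempfile

    workdir = Path(tempfile.mkdtemp())
    elements_file = workdir / "elements.json"
    elements_file.write_text(json.dumps([{"element_id": "abc", "text": "hello"}]))

    staged = ChromaLikeStager().run(elements_file, workdir, "staged-output")
    print(staged.read_text())  # prints the conformed elements as JSON
```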
unstructured_ingest/v2/processes/connectors/couchbase.py

@@ -1,13 +1,12 @@
 import hashlib
-import json
-import sys
 import time
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, List

-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret

 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -18,6 +17,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -40,11 +41,20 @@ from unstructured_ingest.v2.processes.connector_registry import (

 if TYPE_CHECKING:
     from couchbase.cluster import Cluster
+    from couchbase.collection import Collection

 CONNECTOR_TYPE = "couchbase"
 SERVER_API_VERSION = "1"


+class CouchbaseAdditionalMetadata(BaseModel):
+    bucket: str
+
+
+class CouchbaseBatchFileData(BatchFileData):
+    additional_metadata: CouchbaseAdditionalMetadata
+
+
 class CouchbaseAccessConfig(AccessConfig):
     password: str = Field(description="The password for the Couchbase server")

@@ -65,7 +75,8 @@ class CouchbaseConnectionConfig(ConnectionConfig):
     access_config: Secret[CouchbaseAccessConfig]

     @requires_dependencies(["couchbase"], extras="couchbase")
-
+    @contextmanager
+    def get_client(self) -> Generator["Cluster", None, None]:
         from couchbase.auth import PasswordAuthenticator
         from couchbase.cluster import Cluster
         from couchbase.options import ClusterOptions
@@ -73,9 +84,14 @@ class CouchbaseConnectionConfig(ConnectionConfig):
         auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
         options = ClusterOptions(auth)
         options.apply_profile("wan_development")
-        cluster =
-
-
+        cluster = None
+        try:
+            cluster = Cluster(self.connection_string, options)
+            cluster.wait_until_ready(timedelta(seconds=5))
+            yield cluster
+        finally:
+            if cluster:
+                cluster.close()


 class CouchbaseUploadStagerConfig(UploadStagerConfig):
@@ -88,32 +104,16 @@ class CouchbaseUploadStager(UploadStager):
         default_factory=lambda: CouchbaseUploadStagerConfig()
     )

-    def
-
-
-
-
-
-
-
-            elements_contents = json.load(elements_file)
-
-        output_elements = []
-        for element in elements_contents:
-            new_doc = {
-                element["element_id"]: {
-                    "embedding": element.get("embeddings", None),
-                    "text": element.get("text", None),
-                    "metadata": element.get("metadata", None),
-                    "type": element.get("type", None),
-                }
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            data["element_id"]: {
+                "embedding": data.get("embeddings", None),
+                "text": data.get("text", None),
+                "metadata": data.get("metadata", None),
+                "type": data.get("type", None),
             }
-
-
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        with open(output_path, "w") as output_file:
-            json.dump(output_elements, output_file)
-        return output_path
+        }


 class CouchbaseUploaderConfig(UploaderConfig):
@@ -128,26 +128,26 @@ class CouchbaseUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.connection_config.
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(
+            f"writing {len(data)} objects to destination "
             f"bucket, {self.connection_config.bucket} "
             f"at {self.connection_config.connection_string}",
         )
-
-
-
-
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(self.connection_config.bucket)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)

-
-
+            for chunk in batch_generator(data, self.upload_config.batch_size):
+                collection.upsert_multi(
+                    {doc_id: doc for doc in chunk for doc_id, doc in doc.items()}
+                )


 class CouchbaseIndexerConfig(IndexerConfig):
@@ -162,7 +162,7 @@ class CouchbaseIndexer(Indexer):

     def precheck(self) -> None:
         try:
-            self.connection_config.
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -180,41 +180,31 @@ class CouchbaseIndexer(Indexer):
         attempts = 0
         while attempts < max_attempts:
             try:
-
-
-
-
+                with self.connection_config.get_client() as client:
+                    result = client.query(query)
+                    document_ids = [row["id"] for row in result]
+                    return document_ids
             except Exception as e:
                 attempts += 1
                 time.sleep(3)
                 if attempts == max_attempts:
                     raise SourceConnectionError(f"failed to get document ids: {e}")

-    def run(self, **kwargs: Any) -> Generator[
+    def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
         ids = self._get_doc_ids()
-
-        id_batches = [
-            ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-
-            yield FileData(
-                identifier=identified,
+            yield CouchbaseBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
                     date_processed=str(time.time()),
                 ),
-                additional_metadata=
-
-
-
+                additional_metadata=CouchbaseAdditionalMetadata(
+                    bucket=self.connection_config.bucket
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )

@@ -251,7 +241,7 @@ class CouchbaseDownloader(Downloader):
         return concatenated_values

     def generate_download_response(
-        self, result: dict, bucket: str, file_data:
+        self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
     ) -> DownloadResponse:
         record_id = result[self.download_config.collection_id]
         filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
@@ -271,44 +261,53 @@ class CouchbaseDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-                },
-            ),
-        ),
-            path=download_path,
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time.time())
+        cast_file_data.metadata.record_locator = {
+            "connection_string": self.connection_config.connection_string,
+            "bucket": bucket,
+            "scope": self.connection_config.scope,
+            "collection": self.connection_config.collection,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )

     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
-
-
+        couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+        bucket_name: str = couchbase_file_data.additional_metadata.bucket
+        ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]

-
-
-
-
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(bucket_name)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)

-
-
+            download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+            return list(download_resp)

-    def process_doc_id(
+    def process_doc_id(
+        self,
+        doc_id: str,
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         result = collection.get(doc_id)
         return self.generate_download_response(
             result=result.content_as[dict], bucket=bucket_name, file_data=file_data
         )

-    def process_all_doc_ids(
+    def process_all_doc_ids(
+        self,
+        ids: list[str],
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         for doc_id in ids:
             yield self.process_doc_id(doc_id, collection, bucket_name, file_data)

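The indexer above swaps its hand-rolled slicing for `batch_generator` from `unstructured_ingest.utils.data_prep` and yields typed `CouchbaseBatchFileData` records. A plausible reimplementation of just the batching behavior the diff relies on (not the library's actual code):

```python
from typing import Any, Generator


def batch_generator(items: list[Any], batch_size: int = 100) -> Generator[list[Any], None, None]:
    """Yield successive fixed-size slices of a list; the final slice may be shorter."""
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


if __name__ == "__main__":
    doc_ids = [f"doc-{i}" for i in range(7)]
    for batch in batch_generator(doc_ids, batch_size=3):
        print(batch)  # three batches: sizes 3, 3, and 1
```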
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -11,6 +11,7 @@ import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import get_data_df
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
 from unstructured_ingest.v2.interfaces import (
@@ -28,6 +29,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 CONNECTOR_TYPE = "delta_table"


+@requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
     from deltalake.writer import write_deltalake

@@ -136,39 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def
-        logger.debug(f"uploading content from {len(csv_paths)} csv files")
-        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
-        return df
-
-    def process_json(self, json_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(json_paths)} json files")
-        all_records = []
-        for p in json_paths:
-            with open(p) as json_file:
-                all_records.extend(json.load(json_file))
-
-        return pd.DataFrame(data=all_records)
-
-    def process_parquet(self, parquet_paths: list[Path]) -> pd.DataFrame:
-        logger.debug(f"uploading content from {len(parquet_paths)} parquet files")
-        df = pd.concat((pd.read_parquet(path) for path in parquet_paths), ignore_index=True)
-        return df
-
-    def read_dataframe(self, path: Path) -> pd.DataFrame:
-        if path.suffix == ".csv":
-            return self.process_csv(csv_paths=[path])
-        elif path.suffix == ".json":
-            return self.process_json(json_paths=[path])
-        elif path.suffix == ".parquet":
-            return self.process_parquet(parquet_paths=[path])
-        else:
-            raise ValueError(f"Unsupported file type, must be parquet, json or csv file: {path}")
-
-    @requires_dependencies(["deltalake"], extras="delta-table")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-
-        df = self.read_dataframe(path)
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )
@@ -203,6 +173,14 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")

+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        df = get_data_df(path)
+        self.upload_dataframe(df=df, file_data=file_data)
+

 delta_table_destination_entry = DestinationRegistryEntry(
     connection_config=DeltaTableConnectionConfig,