unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +167 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +23 -0
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/validation.py +73 -22
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +24 -10
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/pinecone.py

```diff
@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -23,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
     from pinecone import Index as PineconeIndex
+    from pinecone import Pinecone
 
 
 CONNECTOR_TYPE = "pinecone"
@@ -43,16 +45,19 @@ class PineconeConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self, **index_kwargs) -> "PineconeIndex":
+    def get_client(self, **index_kwargs) -> "Pinecone":
         from pinecone import Pinecone
 
         from unstructured_ingest import __version__ as unstructured_version
 
-        pc = Pinecone(
+        return Pinecone(
            api_key=self.access_config.get_secret_value().pinecone_api_key,
            source_tag=f"unstructured_ingest=={unstructured_version}",
        )
 
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
+        pc = self.get_client()
+
         index = pc.Index(name=self.index_name, **index_kwargs)
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
@@ -106,7 +111,7 @@ class PineconeUploadStager(UploadStager):
         default_factory=lambda: PineconeUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         embeddings = element_dict.pop("embeddings", None)
         metadata: dict[str, Any] = element_dict.pop("metadata", {})
         data_source = metadata.pop("data_source", {})
@@ -121,19 +126,23 @@ class PineconeUploadStager(UploadStager):
             }
         )
 
+        metadata = flatten_dict(
+            pinecone_metadata,
+            separator="-",
+            flatten_lists=True,
+            remove_none=True,
+        )
+        metadata[RECORD_ID_LABEL] = file_data.identifier
+
         return {
             "id": str(uuid.uuid4()),
             "values": embeddings,
-            "metadata": flatten_dict(
-                pinecone_metadata,
-                separator="-",
-                flatten_lists=True,
-                remove_none=True,
-            ),
+            "metadata": metadata,
         }
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -143,10 +152,15 @@ class PineconeUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
 
         conformed_elements = [
-            self.conform_dict(element_dict=element) for element in elements_contents
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
         ]
 
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
         with open(output_path, "w") as output_file:
@@ -167,6 +181,55 @@ class PineconeUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    def pod_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone pod index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        delete_kwargs = {"filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}}}
+        if namespace := self.upload_config.namespace:
+            delete_kwargs["namespace"] = namespace
+
+        resp = index.delete(**delete_kwargs)
+        logger.debug(
+            f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone index: {resp}"
+        )
+
+    def serverless_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone serverless index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        index_stats = index.describe_index_stats()
+        total_vectors = index_stats["total_vector_count"]
+        if total_vectors == 0:
+            return
+        dimension = index_stats["dimension"]
+        query_params = {
+            "filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}},
+            "vector": [0] * dimension,
+            "top_k": total_vectors,
+        }
+        if namespace := self.upload_config.namespace:
+            query_params["namespace"] = namespace
+        while True:
+            query_results = index.query(**query_params)
+            matches = query_results.get("matches", [])
+            if not matches:
+                break
+            ids = [match["id"] for match in matches]
+            delete_params = {"ids": ids}
+            if namespace := self.upload_config.namespace:
+                delete_params["namespace"] = namespace
+            index.delete(**delete_params)
+        logger.debug(
+            f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone index"
+        )
+
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException
@@ -208,7 +271,15 @@ class PineconeUploader(Uploader):
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )
-
+        # Determine if serverless or pod based index
+        pinecone_client = self.connection_config.get_client()
+        index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
+        if "serverless" in index_description.get("spec"):
+            self.serverless_delete_by_record_id(file_data=file_data)
+        elif "pod" in index_description.get("spec"):
+            self.pod_delete_by_record_id(file_data=file_data)
+        else:
+            raise ValueError(f"unexpected spec type in index description: {index_description}")
         self.upsert_batches_async(elements_dict=elements_dict)
 
 
```
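Two things stand out in the Pinecone changes: every staged vector now carries the originating record's identifier under RECORD_ID_LABEL, and the uploader deletes stale vectors for that record before upserting (directly by metadata filter on pod indexes; via a query-then-delete-by-id loop on serverless indexes, which do not support metadata-filtered deletes). A minimal sketch of the staged record shape, using the same helpers the diff imports; the sample element and identifier are invented:

```python
# Sketch of what the stager now emits per element (sample data, not from the diff).
import uuid

from unstructured_ingest.utils.data_prep import flatten_dict
from unstructured_ingest.v2.constants import RECORD_ID_LABEL

element = {
    "embeddings": [0.1, 0.2, 0.3],
    "metadata": {"filename": "report.pdf", "languages": ["eng"], "page_number": None},
}

# Flatten nested metadata into Pinecone-compatible scalar keys, dropping Nones.
metadata = flatten_dict(
    element["metadata"], separator="-", flatten_lists=True, remove_none=True
)
# Tag every vector with its source record so re-ingesting the same file can
# first remove the stale vectors via a metadata filter.
metadata[RECORD_ID_LABEL] = "example-file-id"

record = {"id": str(uuid.uuid4()), "values": element["embeddings"], "metadata": metadata}
```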
unstructured_ingest/v2/processes/connectors/qdrant/__init__.py

```diff
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+)
+
+from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR_TYPE
+from .cloud import qdrant_cloud_destination_entry
+from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
+from .local import qdrant_local_destination_entry
+from .server import CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE
+from .server import qdrant_server_destination_entry
+
+add_destination_entry(destination_type=CLOUD_CONNECTOR_TYPE, entry=qdrant_cloud_destination_entry)
+add_destination_entry(destination_type=SERVER_CONNECTOR_TYPE, entry=qdrant_server_destination_entry)
+add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=qdrant_local_destination_entry)
```
unstructured_ingest/v2/processes/connectors/qdrant/cloud.py

```diff
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
+    QdrantAccessConfig,
+    QdrantConnectionConfig,
+    QdrantUploader,
+    QdrantUploaderConfig,
+    QdrantUploadStager,
+    QdrantUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "qdrant-cloud"
+
+
+class CloudQdrantAccessConfig(QdrantAccessConfig):
+    api_key: str = Field(description="Qdrant API key")
+
+
+class CloudQdrantConnectionConfig(QdrantConnectionConfig):
+    url: str = Field(default=None, description="url of Qdrant Cloud")
+    access_config: Secret[CloudQdrantAccessConfig]
+
+    def get_client_kwargs(self) -> dict:
+        return {
+            "api_key": self.access_config.get_secret_value().api_key,
+            "url": self.url,
+        }
+
+
+class CloudQdrantUploadStagerConfig(QdrantUploadStagerConfig):
+    pass
+
+
+@dataclass
+class CloudQdrantUploadStager(QdrantUploadStager):
+    upload_stager_config: CloudQdrantUploadStagerConfig
+
+
+class CloudQdrantUploaderConfig(QdrantUploaderConfig):
+    pass
+
+
+@dataclass
+class CloudQdrantUploader(QdrantUploader):
+    connection_config: CloudQdrantConnectionConfig
+    upload_config: CloudQdrantUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+qdrant_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=CloudQdrantConnectionConfig,
+    uploader=CloudQdrantUploader,
+    uploader_config=CloudQdrantUploaderConfig,
+    upload_stager=CloudQdrantUploadStager,
+    upload_stager_config=CloudQdrantUploadStagerConfig,
+)
```
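For reference, a hedged sketch of wiring up the new qdrant-cloud destination directly from Python. The URL, API key, and collection name are placeholder values, and passing the access config inline assumes pydantic's Secret wrapper coerces the plain value:

```python
# Sketch: instantiating the new qdrant-cloud destination (placeholder values).
from unstructured_ingest.v2.processes.connectors.qdrant.cloud import (
    CloudQdrantAccessConfig,
    CloudQdrantConnectionConfig,
    CloudQdrantUploader,
    CloudQdrantUploaderConfig,
)

connection_config = CloudQdrantConnectionConfig(
    url="https://example-cluster.cloud.qdrant.io:6333",  # placeholder cluster URL
    access_config=CloudQdrantAccessConfig(api_key="placeholder-api-key"),
)
uploader = CloudQdrantUploader(
    connection_config=connection_config,
    upload_config=CloudQdrantUploaderConfig(collection_name="elements"),
)
uploader.precheck()  # verifies the cluster is reachable before any writes
```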
unstructured_ingest/v2/processes/connectors/qdrant/local.py

```diff
@@ -0,0 +1,58 @@
+from dataclasses import dataclass
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
+    QdrantAccessConfig,
+    QdrantConnectionConfig,
+    QdrantUploader,
+    QdrantUploaderConfig,
+    QdrantUploadStager,
+    QdrantUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "qdrant-local"
+
+
+class LocalQdrantAccessConfig(QdrantAccessConfig):
+    pass
+
+
+class LocalQdrantConnectionConfig(QdrantConnectionConfig):
+    path: str = Field(default=None, description="Persistence path for QdrantLocal.")
+    access_config: Secret[LocalQdrantAccessConfig] = Field(
+        default_factory=LocalQdrantAccessConfig, validate_default=True
+    )
+
+    def get_client_kwargs(self) -> dict:
+        return {"path": self.path}
+
+
+class LocalQdrantUploadStagerConfig(QdrantUploadStagerConfig):
+    pass
+
+
+@dataclass
+class LocalQdrantUploadStager(QdrantUploadStager):
+    upload_stager_config: LocalQdrantUploadStagerConfig
+
+
+class LocalQdrantUploaderConfig(QdrantUploaderConfig):
+    pass
+
+
+@dataclass
+class LocalQdrantUploader(QdrantUploader):
+    connection_config: LocalQdrantConnectionConfig
+    upload_config: LocalQdrantUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+qdrant_local_destination_entry = DestinationRegistryEntry(
+    connection_config=LocalQdrantConnectionConfig,
+    uploader=LocalQdrantUploader,
+    uploader_config=LocalQdrantUploaderConfig,
+    upload_stager=LocalQdrantUploadStager,
+    upload_stager_config=LocalQdrantUploadStagerConfig,
+)
```
unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py

```diff
@@ -0,0 +1,168 @@
+import asyncio
+import json
+import uuid
+from abc import ABC, abstractmethod
+from contextlib import asynccontextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError, WriteError
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from qdrant_client import AsyncQdrantClient
+
+
+class QdrantAccessConfig(AccessConfig, ABC):
+    pass
+
+
+class QdrantConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[QdrantAccessConfig] = Field(
+        default_factory=QdrantAccessConfig, validate_default=True, description="Access Config"
+    )
+
+    @abstractmethod
+    def get_client_kwargs(self) -> dict:
+        pass
+
+    @requires_dependencies(["qdrant_client"], extras="qdrant")
+    @asynccontextmanager
+    async def get_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
+        from qdrant_client.async_qdrant_client import AsyncQdrantClient
+
+        client_kwargs = self.get_client_kwargs()
+        client = AsyncQdrantClient(**client_kwargs)
+        try:
+            yield client
+        finally:
+            await client.close()
+
+
+class QdrantUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class QdrantUploadStager(UploadStager, ABC):
+    upload_stager_config: QdrantUploadStagerConfig = field(
+        default_factory=lambda: QdrantUploadStagerConfig()
+    )
+
+    @staticmethod
+    def conform_dict(data: dict) -> dict:
+        """Prepares dictionary in the format that Chroma requires"""
+        return {
+            "id": str(uuid.uuid4()),
+            "vector": data.pop("embeddings", {}),
+            "payload": {
+                "text": data.pop("text", None),
+                "element_serialized": json.dumps(data),
+                **flatten_dict(
+                    data,
+                    separator="-",
+                    flatten_lists=True,
+                ),
+            },
+        }
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+
+        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+
+        with open(output_path, "w") as output_file:
+            json.dump(conformed_elements, output_file)
+        return output_path
+
+
+class QdrantUploaderConfig(UploaderConfig):
+    collection_name: str = Field(description="Name of the collection.")
+    batch_size: int = Field(default=50, description="Number of records per batch.")
+    num_processes: Optional[int] = Field(
+        default=1,
+        description="Optional limit on number of threads to use for upload.",
+        deprecated=True,
+    )
+
+
+@dataclass
+class QdrantUploader(Uploader, ABC):
+    upload_config: QdrantUploaderConfig
+    connection_config: QdrantConnectionConfig
+
+    @DestinationConnectionError.wrap
+    def precheck(self) -> None:
+        async def check_connection():
+            async with self.connection_config.get_client() as async_client:
+                await async_client.get_collections()
+
+        asyncio.run(check_connection())
+
+    def is_async(self):
+        return True
+
+    async def run_async(
+        self,
+        path: Path,
+        file_data: FileData,
+        **kwargs: Any,
+    ) -> None:
+        with path.open("r") as file:
+            elements: list[dict] = json.load(file)
+
+        logger.debug("Loaded %i elements from %s", len(elements), path)
+
+        batches = list(batch_generator(elements, batch_size=self.upload_config.batch_size))
+        logger.debug(
+            "Elements split into %i batches of size %i.",
+            len(batches),
+            self.upload_config.batch_size,
+        )
+        await asyncio.gather(*[self._upsert_batch(batch) for batch in batches])
+
+    async def _upsert_batch(self, batch: list[dict]) -> None:
+        from qdrant_client import models
+
+        points: list[models.PointStruct] = [models.PointStruct(**item) for item in batch]
+        try:
+            logger.debug(
+                "Upserting %i points to the '%s' collection.",
+                len(points),
+                self.upload_config.collection_name,
+            )
+            async with self.connection_config.get_client() as async_client:
+                await async_client.upsert(
+                    self.upload_config.collection_name, points=points, wait=True
+                )
+        except Exception as api_error:
+            logger.error(
+                "Failed to upsert points to the collection due to the following error %s", api_error
+            )
+
+            raise WriteError(f"Qdrant error: {api_error}") from api_error
+
+        logger.debug("Successfully upsert points to the collection.")
```
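The shared base class opens a fresh AsyncQdrantClient per operation through an async context manager, so the connection is closed even when an upsert raises. The same pattern in isolation (the local server URL is a placeholder):

```python
# Standalone sketch of the connector's client-lifecycle pattern (placeholder URL).
import asyncio
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from qdrant_client import AsyncQdrantClient


@asynccontextmanager
async def get_client(url: str) -> AsyncGenerator[AsyncQdrantClient, None]:
    # Create, yield, and always close the client, mirroring the base class above.
    client = AsyncQdrantClient(url=url)
    try:
        yield client
    finally:
        await client.close()


async def main() -> None:
    async with get_client("http://localhost:6333") as client:
        print(await client.get_collections())


asyncio.run(main())
```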
unstructured_ingest/v2/processes/connectors/qdrant/server.py

```diff
@@ -0,0 +1,60 @@
+from dataclasses import dataclass
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
+    QdrantAccessConfig,
+    QdrantConnectionConfig,
+    QdrantUploader,
+    QdrantUploaderConfig,
+    QdrantUploadStager,
+    QdrantUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "qdrant-server"
+
+
+class ServerQdrantAccessConfig(QdrantAccessConfig):
+    pass
+
+
+class ServerQdrantConnectionConfig(QdrantConnectionConfig):
+    url: str = Field(default=None, description="url of Qdrant server")
+    access_config: Secret[ServerQdrantAccessConfig] = Field(
+        default_factory=ServerQdrantAccessConfig, validate_default=True
+    )
+
+    def get_client_kwargs(self) -> dict:
+        return {
+            "url": self.url,
+        }
+
+
+class ServerQdrantUploadStagerConfig(QdrantUploadStagerConfig):
+    pass
+
+
+@dataclass
+class ServerQdrantUploadStager(QdrantUploadStager):
+    upload_stager_config: ServerQdrantUploadStagerConfig
+
+
+class ServerQdrantUploaderConfig(QdrantUploaderConfig):
+    pass
+
+
+@dataclass
+class ServerQdrantUploader(QdrantUploader):
+    connection_config: ServerQdrantConnectionConfig
+    upload_config: ServerQdrantUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+qdrant_server_destination_entry = DestinationRegistryEntry(
+    connection_config=ServerQdrantConnectionConfig,
+    uploader=ServerQdrantUploader,
+    uploader_config=ServerQdrantUploaderConfig,
+    upload_stager=ServerQdrantUploadStager,
+    upload_stager_config=ServerQdrantUploadStagerConfig,
+)
```
unstructured_ingest/v2/processes/connectors/sharepoint.py

```diff
@@ -21,7 +21,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -426,7 +425,7 @@ class SharepointDownloader(Downloader):
             f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
         if not content_type:
             raise ValueError(
@@ -436,6 +435,8 @@ class SharepointDownloader(Downloader):
             return self.get_document(file_data=file_data)
         elif content_type == SharepointContentType.SITEPAGE.value:
             return self.get_site_page(file_data=file_data)
+        else:
+            raise ValueError(f"content type not recognized: {content_type}")
 
 
 sharepoint_source_entry = SourceRegistryEntry(
```
unstructured_ingest/v2/processes/connectors/slack.py

```diff
@@ -16,9 +16,9 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import (
     FileData,
@@ -161,7 +161,7 @@ class SlackDownloader(Downloader):
     def run(self, file_data, **kwargs):
         raise NotImplementedError
 
-    async def run_async(self, file_data: FileData, **kwargs) -> download_responses:
+    async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None:
```
unstructured_ingest/v2/processes/connectors/sql/postgres.py

```diff
@@ -98,20 +98,28 @@ class PostgresDownloader(SQLDownloader):
     download_config: PostgresDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["psycopg2"], extras="postgres")
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        from psycopg2 import sql
+
         table_name = file_data.additional_metadata["table_name"]
         id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+        ids = tuple(file_data.additional_metadata["ids"])
+
         with self.connection_config.get_cursor() as cursor:
-            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-            query = "SELECT {fields} FROM {table_name} WHERE {id_column} IN ({ids})".format(
+            fields = (
+                sql.SQL(",").join(sql.Identifier(field) for field in self.download_config.fields)
+                if self.download_config.fields
+                else sql.SQL("*")
+            )
+
+            query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
                 fields=fields,
-                table_name=table_name,
-                id_column=id_column,
-                ids=",".join([str(i) for i in ids]),
+                table_name=sql.Identifier(table_name),
+                id_column=sql.Identifier(id_column),
             )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            logger.debug(f"running query: {cursor.mogrify(query, (ids,))}")
+            cursor.execute(query, (ids,))
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
```
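The rewritten query_db composes identifiers with psycopg2's sql module and binds the id list as a query parameter instead of interpolating it into the string, which both quotes names correctly and prevents SQL injection. A minimal standalone sketch of that composition, with sample table and column names:

```python
# Sketch of the psycopg2 composition pattern used above (sample identifiers).
from psycopg2 import sql

fields = sql.SQL(",").join(sql.Identifier(f) for f in ["id", "text"])
query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
    fields=fields,
    table_name=sql.Identifier("elements"),
    id_column=sql.Identifier("id"),
)
# At execution time the ids are passed as a bound tuple, never interpolated:
# cursor.execute(query, ((1, 2, 3),))
```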
unstructured_ingest/v2/processes/connectors/sql/snowflake.py

```diff
@@ -70,7 +70,9 @@ class SnowflakeConnectionConfig(SQLConnectionConfig):
         connect_kwargs["password"] = self.access_config.get_secret_value().password
         # https://peps.python.org/pep-0249/#paramstyle
         connect_kwargs["paramstyle"] = "qmark"
-        connection = connect(**connect_kwargs)
+        # remove anything that is none
+        active_kwargs = {k: v for k, v in connect_kwargs.items() if v is not None}
+        connection = connect(**active_kwargs)
         try:
             yield connection
         finally:
```
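The connection helper now strips None values before calling connect, so unset optional settings fall back to the Snowflake driver's defaults instead of being passed as explicit nulls. The filtering itself is plain Python:

```python
# Sketch of the None-stripping applied before connect() (sample kwargs).
connect_kwargs = {"user": "ingest", "role": None, "paramstyle": "qmark"}
active_kwargs = {k: v for k, v in connect_kwargs.items() if v is not None}
assert active_kwargs == {"user": "ingest", "paramstyle": "qmark"}
```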
unstructured_ingest/v2/processes/connectors/sql/sql.py

```diff
@@ -300,10 +300,8 @@ class SQLUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            connection = self.connection_config.get_connection()
-            cursor = connection.cursor()
-            cursor.execute("SELECT 1;")
-            cursor.close()
+            with self.connection_config.get_cursor() as cursor:
+                cursor.execute("SELECT 1;")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
```
unstructured_ingest/v2/processes/partitioner.py

```diff
@@ -9,7 +9,7 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.unstructured_api import call_api
+from unstructured_ingest.v2.unstructured_api import call_api_async
 
 
 class PartitionerConfig(BaseModel):
@@ -47,7 +47,11 @@ class PartitionerConfig(BaseModel):
     )
     metadata_exclude: list[str] = Field(
         default_factory=list,
         description="If set, drop the specified metadata fields if they exist.",
     )
+    element_exclude: list[str] = Field(
+        default_factory=list,
+        description="If set, drop the specified element types, if they exist.",
+    )
     metadata_include: list[str] = Field(
         default_factory=list,
@@ -100,6 +104,13 @@ class Partitioner(BaseProcess, ABC):
 
     def postprocess(self, elements: list[dict]) -> list[dict]:
         element_dicts = [e.copy() for e in elements]
+        if self.config.element_exclude:
+            element_dicts = list(
+                filter(
+                    lambda element: element["type"] not in self.config.element_exclude,
+                    element_dicts,
+                )
+            )
         for elem in element_dicts:
             if self.config.metadata_exclude:
                 ex_list = self.config.metadata_exclude
@@ -156,7 +167,7 @@ class Partitioner(BaseProcess, ABC):
         metadata = metadata or {}
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
 
-        elements = await call_api(
+        elements = await call_api_async(
             server_url=self.config.partition_endpoint,
             api_key=self.config.api_key.get_secret_value(),
             filename=filename,
```
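The new element_exclude option drops whole elements by their type before any metadata handling, mirroring the filter added in postprocess above. A small sketch of its effect on invented element dicts:

```python
# Sketch of element_exclude filtering (sample data).
element_exclude = ["Header", "Footer"]

elements = [
    {"type": "Title", "text": "Quarterly Report"},
    {"type": "Footer", "text": "page 3 of 10"},
    {"type": "NarrativeText", "text": "Revenue grew."},
]

kept = list(filter(lambda e: e["type"] not in element_exclude, elements))
assert [e["type"] for e in kept] == ["Title", "NarrativeText"]
```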