unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.


Files changed (78)
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -9,6 +9,7 @@ from pydantic import Field, Secret
  from unstructured_ingest.error import DestinationConnectionError
  from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
  from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
@@ -23,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis

  if TYPE_CHECKING:
      from pinecone import Index as PineconeIndex
+     from pinecone import Pinecone


  CONNECTOR_TYPE = "pinecone"
@@ -43,16 +45,19 @@ class PineconeConnectionConfig(ConnectionConfig):
      )

      @requires_dependencies(["pinecone"], extras="pinecone")
-     def get_index(self, **index_kwargs) -> "PineconeIndex":
+     def get_client(self, **index_kwargs) -> "Pinecone":
          from pinecone import Pinecone

          from unstructured_ingest import __version__ as unstructured_version

-         pc = Pinecone(
+         return Pinecone(
              api_key=self.access_config.get_secret_value().pinecone_api_key,
              source_tag=f"unstructured_ingest=={unstructured_version}",
          )

+     def get_index(self, **index_kwargs) -> "PineconeIndex":
+         pc = self.get_client()
+
          index = pc.Index(name=self.index_name, **index_kwargs)
          logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
          return index
@@ -106,7 +111,7 @@ class PineconeUploadStager(UploadStager):
          default_factory=lambda: PineconeUploadStagerConfig()
      )

-     def conform_dict(self, element_dict: dict) -> dict:
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
          embeddings = element_dict.pop("embeddings", None)
          metadata: dict[str, Any] = element_dict.pop("metadata", {})
          data_source = metadata.pop("data_source", {})
@@ -121,19 +126,23 @@ class PineconeUploadStager(UploadStager):
              }
          )

+         metadata = flatten_dict(
+             pinecone_metadata,
+             separator="-",
+             flatten_lists=True,
+             remove_none=True,
+         )
+         metadata[RECORD_ID_LABEL] = file_data.identifier
+
          return {
              "id": str(uuid.uuid4()),
              "values": embeddings,
-             "metadata": flatten_dict(
-                 pinecone_metadata,
-                 separator="-",
-                 flatten_lists=True,
-                 remove_none=True,
-             ),
+             "metadata": metadata,
          }

      def run(
          self,
+         file_data: FileData,
          elements_filepath: Path,
          output_dir: Path,
          output_filename: str,
@@ -143,10 +152,15 @@ class PineconeUploadStager(UploadStager):
              elements_contents = json.load(elements_file)

          conformed_elements = [
-             self.conform_dict(element_dict=element) for element in elements_contents
+             self.conform_dict(element_dict=element, file_data=file_data)
+             for element in elements_contents
          ]

-         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+         if Path(output_filename).suffix != ".json":
+             output_filename = f"{output_filename}.json"
+         else:
+             output_filename = f"{Path(output_filename).stem}.json"
+         output_path = Path(output_dir) / Path(f"{output_filename}")
          output_path.parent.mkdir(parents=True, exist_ok=True)

          with open(output_path, "w") as output_file:
@@ -167,6 +181,55 @@ class PineconeUploader(Uploader):
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

+     def pod_delete_by_record_id(self, file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+             f"from pinecone pod index"
+         )
+         index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+         delete_kwargs = {"filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}}}
+         if namespace := self.upload_config.namespace:
+             delete_kwargs["namespace"] = namespace
+
+         resp = index.delete(**delete_kwargs)
+         logger.debug(
+             f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+             f"from pinecone index: {resp}"
+         )
+
+     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+             f"from pinecone serverless index"
+         )
+         index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+         index_stats = index.describe_index_stats()
+         total_vectors = index_stats["total_vector_count"]
+         if total_vectors == 0:
+             return
+         dimension = index_stats["dimension"]
+         query_params = {
+             "filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}},
+             "vector": [0] * dimension,
+             "top_k": total_vectors,
+         }
+         if namespace := self.upload_config.namespace:
+             query_params["namespace"] = namespace
+         while True:
+             query_results = index.query(**query_params)
+             matches = query_results.get("matches", [])
+             if not matches:
+                 break
+             ids = [match["id"] for match in matches]
+             delete_params = {"ids": ids}
+             if namespace := self.upload_config.namespace:
+                 delete_params["namespace"] = namespace
+             index.delete(**delete_params)
+         logger.debug(
+             f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+             f"from pinecone index"
+         )
+
      @requires_dependencies(["pinecone"], extras="pinecone")
      def upsert_batches_async(self, elements_dict: list[dict]):
          from pinecone.exceptions import PineconeApiException
@@ -208,7 +271,15 @@ class PineconeUploader(Uploader):
              f" index named {self.connection_config.index_name}"
              f" with batch size {self.upload_config.batch_size}"
          )
-
+         # Determine if serverless or pod based index
+         pinecone_client = self.connection_config.get_client()
+         index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
+         if "serverless" in index_description.get("spec"):
+             self.serverless_delete_by_record_id(file_data=file_data)
+         elif "pod" in index_description.get("spec"):
+             self.pod_delete_by_record_id(file_data=file_data)
+         else:
+             raise ValueError(f"unexpected spec type in index description: {index_description}")
          self.upsert_batches_async(elements_dict=elements_dict)

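The upload path above now inspects the index description before upserting, so each file's previous vectors can be deleted with the strategy matching the index type. A minimal standalone sketch of that routing, using only the pinecone SDK calls that appear in this diff (the API key and index name are placeholders):

    from pinecone import Pinecone

    pc = Pinecone(api_key="...")  # placeholder credentials
    description = pc.describe_index(name="my-index")  # hypothetical index name

    # The spec block carries either a "serverless" or a "pod" key; anything
    # else is rejected, mirroring PineconeUploader's ValueError branch.
    if "serverless" in description.get("spec"):
        print("route deletes through serverless_delete_by_record_id")
    elif "pod" in description.get("spec"):
        print("route deletes through pod_delete_by_record_id")
    else:
        raise ValueError(f"unexpected spec type: {description}")
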

unstructured_ingest/v2/processes/connectors/qdrant/__init__.py (new)

@@ -0,0 +1,16 @@
+ from __future__ import annotations
+
+ from unstructured_ingest.v2.processes.connector_registry import (
+     add_destination_entry,
+ )
+
+ from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR_TYPE
+ from .cloud import qdrant_cloud_destination_entry
+ from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
+ from .local import qdrant_local_destination_entry
+ from .server import CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE
+ from .server import qdrant_server_destination_entry
+
+ add_destination_entry(destination_type=CLOUD_CONNECTOR_TYPE, entry=qdrant_cloud_destination_entry)
+ add_destination_entry(destination_type=SERVER_CONNECTOR_TYPE, entry=qdrant_server_destination_entry)
+ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=qdrant_local_destination_entry)
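The package-level `__init__` fans one base connector out into three registry entries, one per deployment flavor. A hedged sketch of how a fourth flavor would be wired in under the same pattern (the `memory` module and its entry are hypothetical, invented only to show the shape):

    from unstructured_ingest.v2.processes.connector_registry import add_destination_entry

    # Hypothetical fourth flavor; a real one would define CONNECTOR_TYPE and a
    # DestinationRegistryEntry in its own module, like cloud.py/local.py/server.py.
    from .memory import CONNECTOR_TYPE as MEMORY_CONNECTOR_TYPE  # hypothetical module
    from .memory import qdrant_memory_destination_entry

    add_destination_entry(
        destination_type=MEMORY_CONNECTOR_TYPE, entry=qdrant_memory_destination_entry
    )
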

unstructured_ingest/v2/processes/connectors/qdrant/cloud.py (new)

@@ -0,0 +1,59 @@
+ from dataclasses import dataclass
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
+     QdrantAccessConfig,
+     QdrantConnectionConfig,
+     QdrantUploader,
+     QdrantUploaderConfig,
+     QdrantUploadStager,
+     QdrantUploadStagerConfig,
+ )
+
+ CONNECTOR_TYPE = "qdrant-cloud"
+
+
+ class CloudQdrantAccessConfig(QdrantAccessConfig):
+     api_key: str = Field(description="Qdrant API key")
+
+
+ class CloudQdrantConnectionConfig(QdrantConnectionConfig):
+     url: str = Field(default=None, description="url of Qdrant Cloud")
+     access_config: Secret[CloudQdrantAccessConfig]
+
+     def get_client_kwargs(self) -> dict:
+         return {
+             "api_key": self.access_config.get_secret_value().api_key,
+             "url": self.url,
+         }
+
+
+ class CloudQdrantUploadStagerConfig(QdrantUploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class CloudQdrantUploadStager(QdrantUploadStager):
+     upload_stager_config: CloudQdrantUploadStagerConfig
+
+
+ class CloudQdrantUploaderConfig(QdrantUploaderConfig):
+     pass
+
+
+ @dataclass
+ class CloudQdrantUploader(QdrantUploader):
+     connection_config: CloudQdrantConnectionConfig
+     upload_config: CloudQdrantUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ qdrant_cloud_destination_entry = DestinationRegistryEntry(
+     connection_config=CloudQdrantConnectionConfig,
+     uploader=CloudQdrantUploader,
+     uploader_config=CloudQdrantUploaderConfig,
+     upload_stager=CloudQdrantUploadStager,
+     upload_stager_config=CloudQdrantUploadStagerConfig,
+ )

unstructured_ingest/v2/processes/connectors/qdrant/local.py (new)

@@ -0,0 +1,58 @@
+ from dataclasses import dataclass
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
+     QdrantAccessConfig,
+     QdrantConnectionConfig,
+     QdrantUploader,
+     QdrantUploaderConfig,
+     QdrantUploadStager,
+     QdrantUploadStagerConfig,
+ )
+
+ CONNECTOR_TYPE = "qdrant-local"
+
+
+ class LocalQdrantAccessConfig(QdrantAccessConfig):
+     pass
+
+
+ class LocalQdrantConnectionConfig(QdrantConnectionConfig):
+     path: str = Field(default=None, description="Persistence path for QdrantLocal.")
+     access_config: Secret[LocalQdrantAccessConfig] = Field(
+         default_factory=LocalQdrantAccessConfig, validate_default=True
+     )
+
+     def get_client_kwargs(self) -> dict:
+         return {"path": self.path}
+
+
+ class LocalQdrantUploadStagerConfig(QdrantUploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class LocalQdrantUploadStager(QdrantUploadStager):
+     upload_stager_config: LocalQdrantUploadStagerConfig
+
+
+ class LocalQdrantUploaderConfig(QdrantUploaderConfig):
+     pass
+
+
+ @dataclass
+ class LocalQdrantUploader(QdrantUploader):
+     connection_config: LocalQdrantConnectionConfig
+     upload_config: LocalQdrantUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ qdrant_local_destination_entry = DestinationRegistryEntry(
+     connection_config=LocalQdrantConnectionConfig,
+     uploader=LocalQdrantUploader,
+     uploader_config=LocalQdrantUploaderConfig,
+     upload_stager=LocalQdrantUploadStager,
+     upload_stager_config=LocalQdrantUploadStagerConfig,
+ )

unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py (new)

@@ -0,0 +1,168 @@
+ import asyncio
+ import json
+ import uuid
+ from abc import ABC, abstractmethod
+ from contextlib import asynccontextmanager
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     FileData,
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+ )
+ from unstructured_ingest.v2.logger import logger
+
+ if TYPE_CHECKING:
+     from qdrant_client import AsyncQdrantClient
+
+
+ class QdrantAccessConfig(AccessConfig, ABC):
+     pass
+
+
+ class QdrantConnectionConfig(ConnectionConfig, ABC):
+     access_config: Secret[QdrantAccessConfig] = Field(
+         default_factory=QdrantAccessConfig, validate_default=True, description="Access Config"
+     )
+
+     @abstractmethod
+     def get_client_kwargs(self) -> dict:
+         pass
+
+     @requires_dependencies(["qdrant_client"], extras="qdrant")
+     @asynccontextmanager
+     async def get_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
+         from qdrant_client.async_qdrant_client import AsyncQdrantClient
+
+         client_kwargs = self.get_client_kwargs()
+         client = AsyncQdrantClient(**client_kwargs)
+         try:
+             yield client
+         finally:
+             await client.close()
+
+
+ class QdrantUploadStagerConfig(UploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class QdrantUploadStager(UploadStager, ABC):
+     upload_stager_config: QdrantUploadStagerConfig = field(
+         default_factory=lambda: QdrantUploadStagerConfig()
+     )
+
+     @staticmethod
+     def conform_dict(data: dict) -> dict:
+         """Prepares dictionary in the format that Chroma requires"""
+         return {
+             "id": str(uuid.uuid4()),
+             "vector": data.pop("embeddings", {}),
+             "payload": {
+                 "text": data.pop("text", None),
+                 "element_serialized": json.dumps(data),
+                 **flatten_dict(
+                     data,
+                     separator="-",
+                     flatten_lists=True,
+                 ),
+             },
+         }
+
+     def run(
+         self,
+         elements_filepath: Path,
+         file_data: FileData,
+         output_dir: Path,
+         output_filename: str,
+         **kwargs: Any,
+     ) -> Path:
+         with open(elements_filepath) as elements_file:
+             elements_contents = json.load(elements_file)
+
+         conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+
+         with open(output_path, "w") as output_file:
+             json.dump(conformed_elements, output_file)
+         return output_path
+
+
+ class QdrantUploaderConfig(UploaderConfig):
+     collection_name: str = Field(description="Name of the collection.")
+     batch_size: int = Field(default=50, description="Number of records per batch.")
+     num_processes: Optional[int] = Field(
+         default=1,
+         description="Optional limit on number of threads to use for upload.",
+         deprecated=True,
+     )
+
+
+ @dataclass
+ class QdrantUploader(Uploader, ABC):
+     upload_config: QdrantUploaderConfig
+     connection_config: QdrantConnectionConfig
+
+     @DestinationConnectionError.wrap
+     def precheck(self) -> None:
+         async def check_connection():
+             async with self.connection_config.get_client() as async_client:
+                 await async_client.get_collections()
+
+         asyncio.run(check_connection())
+
+     def is_async(self):
+         return True
+
+     async def run_async(
+         self,
+         path: Path,
+         file_data: FileData,
+         **kwargs: Any,
+     ) -> None:
+         with path.open("r") as file:
+             elements: list[dict] = json.load(file)
+
+         logger.debug("Loaded %i elements from %s", len(elements), path)
+
+         batches = list(batch_generator(elements, batch_size=self.upload_config.batch_size))
+         logger.debug(
+             "Elements split into %i batches of size %i.",
+             len(batches),
+             self.upload_config.batch_size,
+         )
+         await asyncio.gather(*[self._upsert_batch(batch) for batch in batches])
+
+     async def _upsert_batch(self, batch: list[dict]) -> None:
+         from qdrant_client import models
+
+         points: list[models.PointStruct] = [models.PointStruct(**item) for item in batch]
+         try:
+             logger.debug(
+                 "Upserting %i points to the '%s' collection.",
+                 len(points),
+                 self.upload_config.collection_name,
+             )
+             async with self.connection_config.get_client() as async_client:
+                 await async_client.upsert(
+                     self.upload_config.collection_name, points=points, wait=True
+                 )
+         except Exception as api_error:
+             logger.error(
+                 "Failed to upsert points to the collection due to the following error %s", api_error
+             )
+
+             raise WriteError(f"Qdrant error: {api_error}") from api_error
+
+         logger.debug("Successfully upsert points to the collection.")

unstructured_ingest/v2/processes/connectors/qdrant/server.py (new)

@@ -0,0 +1,60 @@
+ from dataclasses import dataclass
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
+     QdrantAccessConfig,
+     QdrantConnectionConfig,
+     QdrantUploader,
+     QdrantUploaderConfig,
+     QdrantUploadStager,
+     QdrantUploadStagerConfig,
+ )
+
+ CONNECTOR_TYPE = "qdrant-server"
+
+
+ class ServerQdrantAccessConfig(QdrantAccessConfig):
+     pass
+
+
+ class ServerQdrantConnectionConfig(QdrantConnectionConfig):
+     url: str = Field(default=None, description="url of Qdrant server")
+     access_config: Secret[ServerQdrantAccessConfig] = Field(
+         default_factory=ServerQdrantAccessConfig, validate_default=True
+     )
+
+     def get_client_kwargs(self) -> dict:
+         return {
+             "url": self.url,
+         }
+
+
+ class ServerQdrantUploadStagerConfig(QdrantUploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class ServerQdrantUploadStager(QdrantUploadStager):
+     upload_stager_config: ServerQdrantUploadStagerConfig
+
+
+ class ServerQdrantUploaderConfig(QdrantUploaderConfig):
+     pass
+
+
+ @dataclass
+ class ServerQdrantUploader(QdrantUploader):
+     connection_config: ServerQdrantConnectionConfig
+     upload_config: ServerQdrantUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+
+ qdrant_server_destination_entry = DestinationRegistryEntry(
+     connection_config=ServerQdrantConnectionConfig,
+     uploader=ServerQdrantUploader,
+     uploader_config=ServerQdrantUploaderConfig,
+     upload_stager=ServerQdrantUploadStager,
+     upload_stager_config=ServerQdrantUploadStagerConfig,
+ )

unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -21,7 +21,6 @@ from unstructured_ingest.v2.interfaces import (
      Indexer,
      IndexerConfig,
      SourceIdentifiers,
-     download_responses,
  )
  from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -426,7 +425,7 @@ class SharepointDownloader(Downloader):
              f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
          return self.generate_download_response(file_data=file_data, download_path=download_path)

-     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
          content_type = file_data.additional_metadata.get("sharepoint_content_type")
          if not content_type:
              raise ValueError(
@@ -436,6 +435,8 @@ class SharepointDownloader(Downloader):
              return self.get_document(file_data=file_data)
          elif content_type == SharepointContentType.SITEPAGE.value:
              return self.get_site_page(file_data=file_data)
+         else:
+             raise ValueError(f"content type not recognized: {content_type}")


  sharepoint_source_entry = SourceRegistryEntry(

unstructured_ingest/v2/processes/connectors/slack.py

@@ -16,9 +16,9 @@ from unstructured_ingest.v2.interfaces import (
      ConnectionConfig,
      Downloader,
      DownloaderConfig,
+     DownloadResponse,
      Indexer,
      IndexerConfig,
-     download_responses,
  )
  from unstructured_ingest.v2.interfaces.file_data import (
      FileData,
@@ -161,7 +161,7 @@ class SlackDownloader(Downloader):
      def run(self, file_data, **kwargs):
          raise NotImplementedError

-     async def run_async(self, file_data: FileData, **kwargs) -> download_responses:
+     async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
          # NOTE: Indexer should provide source identifiers required to generate the download path
          download_path = self.get_download_path(file_data)
          if download_path is None:

unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -98,20 +98,28 @@ class PostgresDownloader(SQLDownloader):
      download_config: PostgresDownloaderConfig
      connector_type: str = CONNECTOR_TYPE

+     @requires_dependencies(["psycopg2"], extras="postgres")
      def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+         from psycopg2 import sql
+
          table_name = file_data.additional_metadata["table_name"]
          id_column = file_data.additional_metadata["id_column"]
-         ids = file_data.additional_metadata["ids"]
+         ids = tuple(file_data.additional_metadata["ids"])
+
          with self.connection_config.get_cursor() as cursor:
-             fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-             query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+             fields = (
+                 sql.SQL(",").join(sql.Identifier(field) for field in self.download_config.fields)
+                 if self.download_config.fields
+                 else sql.SQL("*")
+             )
+
+             query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
                  fields=fields,
-                 table_name=table_name,
-                 id_column=id_column,
-                 ids=",".join([str(i) for i in ids]),
+                 table_name=sql.Identifier(table_name),
+                 id_column=sql.Identifier(id_column),
              )
-             logger.debug(f"running query: {query}")
-             cursor.execute(query)
+             logger.debug(f"running query: {cursor.mogrify(query, (ids,))}")
+             cursor.execute(query, (ids,))
              rows = cursor.fetchall()
              columns = [col[0] for col in cursor.description]
              return rows, columns
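The rewritten `query_db` replaces `str.format` interpolation with `psycopg2.sql` composition: identifiers are quoted by the driver and the id tuple is bound as a query parameter rather than spliced into the statement, closing off SQL injection through table, column, or id values. A standalone sketch of the same pattern (the DSN, table, and column names are placeholders):

    import psycopg2
    from psycopg2 import sql

    conn = psycopg2.connect("dbname=example")  # placeholder DSN
    with conn.cursor() as cursor:
        query = sql.SQL("SELECT {fields} FROM {table} WHERE {id_column} IN %s").format(
            fields=sql.SQL(",").join([sql.Identifier("id"), sql.Identifier("text")]),
            table=sql.Identifier("documents"),  # placeholder table
            id_column=sql.Identifier("id"),
        )
        # mogrify renders the fully bound statement for logging without executing it.
        print(cursor.mogrify(query, ((1, 2, 3),)))
        cursor.execute(query, ((1, 2, 3),))
        rows = cursor.fetchall()
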

unstructured_ingest/v2/processes/connectors/sql/snowflake.py

@@ -70,7 +70,9 @@ class SnowflakeConnectionConfig(SQLConnectionConfig):
          connect_kwargs["password"] = self.access_config.get_secret_value().password
          # https://peps.python.org/pep-0249/#paramstyle
          connect_kwargs["paramstyle"] = "qmark"
-         connection = connect(**connect_kwargs)
+         # remove anything that is none
+         active_kwargs = {k: v for k, v in connect_kwargs.items() if v is not None}
+         connection = connect(**active_kwargs)
          try:
              yield connection
          finally:

unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -300,10 +300,8 @@ class SQLUploader(Uploader):

      def precheck(self) -> None:
          try:
-             connection = self.connection_config.get_connection()
-             cursor = connection.cursor()
-             cursor.execute("SELECT 1;")
-             cursor.close()
+             with self.connection_config.get_cursor() as cursor:
+                 cursor.execute("SELECT 1;")
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")
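The precheck now goes through the connection config's `get_cursor` context manager instead of hand-managing a connection and cursor, so cleanup happens even when the probe query fails. A generic sketch of such a helper (sqlite3 stands in here for whatever DB-API driver the concrete config wraps):

    import sqlite3
    from contextlib import contextmanager

    @contextmanager
    def get_cursor():
        # Placeholder driver; the real configs wrap postgres/snowflake connections.
        connection = sqlite3.connect(":memory:")
        try:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
        finally:
            connection.close()

    with get_cursor() as cursor:
        cursor.execute("SELECT 1;")
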

unstructured_ingest/v2/processes/partitioner.py

@@ -9,7 +9,7 @@ from unstructured_ingest.utils.data_prep import flatten_dict
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces.process import BaseProcess
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.unstructured_api import call_api
+ from unstructured_ingest.v2.unstructured_api import call_api_async


  class PartitionerConfig(BaseModel):
@@ -47,7 +47,11 @@ class PartitionerConfig(BaseModel):
      )
      metadata_exclude: list[str] = Field(
          default_factory=list,
-         description="If set, drop the specified metadata " "fields if they exist.",
+         description="If set, drop the specified metadata fields if they exist.",
+     )
+     element_exclude: list[str] = Field(
+         default_factory=list,
+         description="If set, drop the specified element types, if they exist.",
      )
      metadata_include: list[str] = Field(
          default_factory=list,
@@ -100,6 +104,13 @@ class Partitioner(BaseProcess, ABC):

      def postprocess(self, elements: list[dict]) -> list[dict]:
          element_dicts = [e.copy() for e in elements]
+         if self.config.element_exclude:
+             element_dicts = list(
+                 filter(
+                     lambda element: element["type"] not in self.config.element_exclude,
+                     element_dicts,
+                 )
+             )
          for elem in element_dicts:
              if self.config.metadata_exclude:
                  ex_list = self.config.metadata_exclude
@@ -156,7 +167,7 @@ class Partitioner(BaseProcess, ABC):
          metadata = metadata or {}
          logger.debug(f"partitioning file {filename} with metadata: {metadata}")

-         elements = await call_api(
+         elements = await call_api_async(
              server_url=self.config.partition_endpoint,
              api_key=self.config.api_key.get_secret_value(),
              filename=filename,
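The new `element_exclude` option drops whole elements by their `type` before the metadata include/exclude filters run. A pure-Python sketch of the predicate `postprocess` applies (the element type names are illustrative):

    element_exclude = ["PageBreak", "Footer"]  # illustrative type names

    elements = [
        {"type": "NarrativeText", "text": "body text"},
        {"type": "PageBreak", "text": ""},
    ]

    # Same filter as Partitioner.postprocess: keep elements whose type
    # is not in the exclude list.
    kept = [e for e in elements if e["type"] not in element_exclude]
    assert [e["type"] for e in kept] == ["NarrativeText"]
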