unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_kafka.py +116 -16
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_s3.py +23 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -4
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +78 -23
- unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +20 -19
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +58 -37
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/kafka/cloud.py

@@ -4,7 +4,10 @@ from typing import TYPE_CHECKING, Optional
 
 from pydantic import Field, Secret, SecretStr
 
-from unstructured_ingest.v2.processes.connector_registry import
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaAccessConfig,
     KafkaConnectionConfig,
@@ -12,6 +15,8 @@ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaDownloaderConfig,
     KafkaIndexer,
     KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
 )
 
 if TYPE_CHECKING:
@@ -41,7 +46,21 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
             "group.id": "default_group_id",
             "enable.auto.commit": "false",
             "auto.offset.reset": "earliest",
-            "
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
             "sasl.username": access_config.api_key,
             "sasl.password": access_config.secret,
             "sasl.mechanism": "PLAIN",
@@ -73,6 +92,17 @@ class CloudKafkaDownloader(KafkaDownloader):
     connector_type: str = CONNECTOR_TYPE
 
 
+class CloudKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaUploader(KafkaUploader):
+    connection_config: CloudKafkaConnectionConfig
+    upload_config: CloudKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
 kafka_cloud_source_entry = SourceRegistryEntry(
     connection_config=CloudKafkaConnectionConfig,
     indexer=CloudKafkaIndexer,
@@ -80,3 +110,9 @@ kafka_cloud_source_entry = SourceRegistryEntry(
     downloader=CloudKafkaDownloader,
     downloader_config=CloudKafkaDownloaderConfig,
 )
+
+kafka_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    uploader=CloudKafkaUploader,
+    uploader_config=CloudKafkaUploaderConfig,
+)
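The new get_producer_configuration method mirrors the existing consumer configuration and reuses the same SASL credentials, so the cloud Kafka connector can now write to a topic as well as read from one. As a rough illustration of what that configuration drives, here is a minimal sketch using confluent_kafka directly; the broker address, credentials, topic, and payload are placeholders, not values from the release:

# Sketch only: mirrors the SASL_SSL settings assembled by
# CloudKafkaConnectionConfig.get_producer_configuration() in the diff above.
from confluent_kafka import Producer

conf = {
    "bootstrap.servers": "my-cluster.example.confluent.cloud:9092",  # placeholder
    "sasl.username": "my-api-key",      # placeholder for access_config.api_key
    "sasl.password": "my-api-secret",   # placeholder for access_config.secret
    "sasl.mechanism": "PLAIN",
    "security.protocol": "SASL_SSL",
}

producer = Producer(conf)
producer.produce(topic="ingest-output", value=b"{}")  # placeholder topic/payload
producer.flush(timeout=10.0)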
unstructured_ingest/v2/processes/connectors/kafka/kafka.py

@@ -1,3 +1,4 @@
+import json
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -5,32 +6,33 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional
 
-from pydantic import Secret
+from pydantic import Field, Secret
 
 from unstructured_ingest.error import (
+    DestinationConnectionError,
     SourceConnectionError,
     SourceConnectionNetworkError,
 )
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-
+    Uploader,
+    UploaderConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
 
 if TYPE_CHECKING:
-    from confluent_kafka import Consumer
-
-CONNECTOR_TYPE = "kafka"
+    from confluent_kafka import Consumer, Producer
 
 
 class KafkaAccessConfig(AccessConfig, ABC):
@@ -39,7 +41,6 @@ class KafkaAccessConfig(AccessConfig, ABC):
 
 class KafkaConnectionConfig(ConnectionConfig, ABC):
     access_config: Secret[KafkaAccessConfig]
-    timeout: Optional[float] = 1.0
     bootstrap_server: str
     port: int
 
@@ -47,6 +48,10 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
     def get_consumer_configuration(self) -> dict:
         pass
 
+    @abstractmethod
+    def get_producer_configuration(self) -> dict:
+        pass
+
     @contextmanager
     @requires_dependencies(["confluent_kafka"], extras="kafka")
     def get_consumer(self) -> ContextManager["Consumer"]:
@@ -59,20 +64,27 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
         finally:
             consumer.close()
 
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_producer(self) -> "Producer":
+        from confluent_kafka import Producer
+
+        producer = Producer(self.get_producer_configuration())
+        return producer
+
 
 class KafkaIndexerConfig(IndexerConfig):
-    topic: str
+    topic: str = Field(description="which topic to consume from")
     num_messages_to_consume: Optional[int] = 100
+    timeout: Optional[float] = Field(default=1.0, description="polling timeout")
 
     def update_consumer(self, consumer: "Consumer") -> None:
         consumer.subscribe([self.topic])
 
 
 @dataclass
-class KafkaIndexer(Indexer):
+class KafkaIndexer(Indexer, ABC):
     connection_config: KafkaConnectionConfig
     index_config: KafkaIndexerConfig
-    connector_type: str = CONNECTOR_TYPE
 
     @contextmanager
     def get_consumer(self) -> ContextManager["Consumer"]:
@@ -90,7 +102,7 @@ class KafkaIndexer(Indexer):
         num_messages_to_consume = self.index_config.num_messages_to_consume
         with self.get_consumer() as consumer:
             while messages_consumed < num_messages_to_consume and empty_polls < max_empty_polls:
-                msg = consumer.poll(timeout=self.
+                msg = consumer.poll(timeout=self.index_config.timeout)
                 if msg is None:
                     logger.debug("No Kafka messages found")
                     empty_polls += 1
@@ -139,13 +151,13 @@ class KafkaIndexer(Indexer):
         for message in self.generate_messages():
             yield self.generate_file_data(message)
 
-    async def run_async(self, file_data: FileData, **kwargs: Any) ->
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         raise NotImplementedError()
 
     def precheck(self):
         try:
             with self.get_consumer() as consumer:
-                cluster_meta = consumer.list_topics(timeout=self.
+                cluster_meta = consumer.list_topics(timeout=self.index_config.timeout)
                 current_topics = [
                     topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
                 ]
@@ -160,14 +172,13 @@ class KafkaDownloaderConfig(DownloaderConfig):
 
 
 @dataclass
-class KafkaDownloader(Downloader):
+class KafkaDownloader(Downloader, ABC):
     connection_config: KafkaConnectionConfig
     download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
-    connector_type: str = CONNECTOR_TYPE
     version: Optional[str] = None
     source_url: Optional[str] = None
 
-    def run(self, file_data: FileData, **kwargs: Any) ->
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         source_identifiers = file_data.source_identifiers
         if source_identifiers is None:
             raise ValueError("FileData is missing source_identifiers")
@@ -187,10 +198,54 @@ class KafkaDownloader(Downloader):
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
 
-
-
-
-
-
-
-
+class KafkaUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=100, description="Batch size")
+    topic: str = Field(description="which topic to write to")
+    timeout: Optional[float] = Field(
+        default=10.0, description="Timeout in seconds to flush batch of messages"
+    )
+
+
+@dataclass
+class KafkaUploader(Uploader, ABC):
+    connection_config: KafkaConnectionConfig
+    upload_config: KafkaUploaderConfig
+
+    def precheck(self):
+        try:
+            with self.connection_config.get_consumer() as consumer:
+                cluster_meta = consumer.list_topics(timeout=self.upload_config.timeout)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+            logger.info(f"successfully checked available topics: {current_topics}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def produce_batch(self, elements: list[dict]) -> None:
+        from confluent_kafka.error import KafkaException
+
+        producer = self.connection_config.get_producer()
+        failed_producer = False
+
+        def acked(err, msg):
+            if err is not None:
+                logger.error("Failed to deliver message: %s: %s" % (str(msg), str(err)))
+
+        for element in elements:
+            producer.produce(
+                topic=self.upload_config.topic,
+                value=json.dumps(element),
+                callback=acked,
+            )
+
+        producer.flush(timeout=self.upload_config.timeout)
+        if failed_producer:
+            raise KafkaException("failed to produce all messages in batch")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as elements_file:
+            elements = json.load(elements_file)
+        for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+            self.produce_batch(elements=element_batch)
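The new KafkaUploader reads the staged elements JSON in run(), batches it with batch_generator, and produce_batch() publishes each element as a JSON-encoded message with a delivery callback before flushing. Below is a standalone sketch of the same produce-and-flush pattern outside the connector classes; the broker config, topic, and file path are placeholders, and plain slicing stands in for batch_generator:

# Sketch only: reproduces the produce/flush flow of KafkaUploader.
import json
from pathlib import Path

from confluent_kafka import Producer


def produce_elements(elements_path: Path, topic: str, conf: dict, batch_size: int = 100) -> None:
    producer = Producer(conf)

    def acked(err, msg):
        # Delivery callback: the uploader logs failed deliveries here.
        if err is not None:
            print(f"Failed to deliver message: {msg}: {err}")

    elements = json.loads(elements_path.read_text())
    for start in range(0, len(elements), batch_size):
        for element in elements[start : start + batch_size]:
            producer.produce(topic=topic, value=json.dumps(element), callback=acked)
        producer.flush(timeout=10.0)


# Example (placeholders):
# produce_elements(Path("elements.json"), topic="ingest-output",
#                  conf={"bootstrap.servers": "localhost:29092"})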
unstructured_ingest/v2/processes/connectors/kafka/local.py

@@ -1,10 +1,12 @@
-import socket
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.processes.connector_registry import
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaAccessConfig,
     KafkaConnectionConfig,
@@ -12,6 +14,8 @@ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaDownloaderConfig,
     KafkaIndexer,
     KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
 )
 
 if TYPE_CHECKING:
@@ -35,11 +39,18 @@ class LocalKafkaConnectionConfig(KafkaConnectionConfig):
 
         conf = {
             "bootstrap.servers": f"{bootstrap}:{port}",
-            "client.id": socket.gethostname(),
             "group.id": "default_group_id",
             "enable.auto.commit": "false",
             "auto.offset.reset": "earliest",
-
+        }
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
         }
         return conf
 
@@ -66,6 +77,17 @@ class LocalKafkaDownloader(KafkaDownloader):
     connector_type: str = CONNECTOR_TYPE
 
 
+class LocalKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaUploader(KafkaUploader):
+    connection_config: LocalKafkaConnectionConfig
+    upload_config: LocalKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
 kafka_local_source_entry = SourceRegistryEntry(
     connection_config=LocalKafkaConnectionConfig,
     indexer=LocalKafkaIndexer,
@@ -73,3 +95,9 @@ kafka_local_source_entry = SourceRegistryEntry(
     downloader=LocalKafkaDownloader,
     downloader_config=LocalKafkaDownloaderConfig,
 )
+
+kafka_local_destination_entry = DestinationRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    uploader=LocalKafkaUploader,
+    uploader_config=LocalKafkaUploaderConfig,
+)
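With kafka_local_destination_entry registered, a local Kafka topic can now be used as an upload destination as well as a source. A rough sketch of wiring the uploader by hand follows; it assumes LocalKafkaAccessConfig requires no credentials for a local broker (that class is not shown in this diff), and the host, port, and topic are placeholders:

# Sketch only: manual wiring of the new local Kafka destination.
from unstructured_ingest.v2.processes.connectors.kafka.local import (
    LocalKafkaAccessConfig,  # assumed: no required credentials for a local broker
    LocalKafkaConnectionConfig,
    LocalKafkaUploader,
    LocalKafkaUploaderConfig,
)

uploader = LocalKafkaUploader(
    connection_config=LocalKafkaConnectionConfig(
        access_config=LocalKafkaAccessConfig(),
        bootstrap_server="localhost",  # placeholder
        port=29092,                    # placeholder
    ),
    upload_config=LocalKafkaUploaderConfig(topic="ingest-output", batch_size=100),
)
uploader.precheck()  # lists topics through a consumer, as in the base KafkaUploader
# uploader.run(path=..., file_data=...)  # normally driven by the v2 pipeline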
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -28,7 +28,6 @@ from unstructured_ingest.v2.interfaces import (
     SourceIdentifiers,
     Uploader,
     UploaderConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -220,7 +219,7 @@ class OnedriveDownloader(Downloader):
         return self.download_dir / Path(rel_path)
 
     @SourceConnectionError.wrap
-    def run(self, file_data: FileData, **kwargs: Any) ->
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         file = self._fetch_file(file_data=file_data)
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
@@ -233,7 +232,7 @@ class OnedriveDownloader(Downloader):
         else:
             with download_path.open(mode="wb") as f:
                 file.download(f).execute_query()
-        return
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
 
 
 class OnedriveUploaderConfig(UploaderConfig):
unstructured_ingest/v2/processes/connectors/outlook.py

@@ -15,10 +15,10 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -191,7 +191,7 @@ class OutlookDownloader(Downloader):
     connection_config: OutlookConnectionConfig
     download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
 
-    def run(self, file_data: FileData, **kwargs: Any) ->
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None:
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -23,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 
 if TYPE_CHECKING:
     from pinecone import Index as PineconeIndex
+    from pinecone import Pinecone
 
 
 CONNECTOR_TYPE = "pinecone"
@@ -43,16 +45,19 @@ class PineconeConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["pinecone"], extras="pinecone")
-    def
+    def get_client(self, **index_kwargs) -> "Pinecone":
         from pinecone import Pinecone
 
         from unstructured_ingest import __version__ as unstructured_version
 
-
+        return Pinecone(
             api_key=self.access_config.get_secret_value().pinecone_api_key,
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )
 
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
+        pc = self.get_client()
+
         index = pc.Index(name=self.index_name, **index_kwargs)
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
@@ -106,7 +111,7 @@ class PineconeUploadStager(UploadStager):
         default_factory=lambda: PineconeUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         embeddings = element_dict.pop("embeddings", None)
         metadata: dict[str, Any] = element_dict.pop("metadata", {})
         data_source = metadata.pop("data_source", {})
@@ -121,19 +126,23 @@ class PineconeUploadStager(UploadStager):
             }
         )
 
+        metadata = flatten_dict(
+            pinecone_metadata,
+            separator="-",
+            flatten_lists=True,
+            remove_none=True,
+        )
+        metadata[RECORD_ID_LABEL] = file_data.identifier
+
         return {
             "id": str(uuid.uuid4()),
             "values": embeddings,
-            "metadata":
-                pinecone_metadata,
-                separator="-",
-                flatten_lists=True,
-                remove_none=True,
-            ),
+            "metadata": metadata,
         }
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -143,10 +152,15 @@ class PineconeUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
 
         conformed_elements = [
-            self.conform_dict(element_dict=element)
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
         ]
 
-
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
         with open(output_path, "w") as output_file:
@@ -167,6 +181,55 @@ class PineconeUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    def pod_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone pod index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        delete_kwargs = {"filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}}}
+        if namespace := self.upload_config.namespace:
+            delete_kwargs["namespace"] = namespace
+
+        resp = index.delete(**delete_kwargs)
+        logger.debug(
+            f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone index: {resp}"
+        )
+
+    def serverless_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone serverless index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        index_stats = index.describe_index_stats()
+        total_vectors = index_stats["total_vector_count"]
+        if total_vectors == 0:
+            return
+        dimension = index_stats["dimension"]
+        query_params = {
+            "filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}},
+            "vector": [0] * dimension,
+            "top_k": total_vectors,
+        }
+        if namespace := self.upload_config.namespace:
+            query_params["namespace"] = namespace
+        while True:
+            query_results = index.query(**query_params)
+            matches = query_results.get("matches", [])
+            if not matches:
+                break
+            ids = [match["id"] for match in matches]
+            delete_params = {"ids": ids}
+            if namespace := self.upload_config.namespace:
+                delete_params["namespace"] = namespace
+            index.delete(**delete_params)
+        logger.debug(
+            f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone index"
+        )
+
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException
@@ -208,7 +271,15 @@ class PineconeUploader(Uploader):
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )
-
+        # Determine if serverless or pod based index
+        pinecone_client = self.connection_config.get_client()
+        index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
+        if "serverless" in index_description.get("spec"):
+            self.serverless_delete_by_record_id(file_data=file_data)
+        elif "pod" in index_description.get("spec"):
+            self.pod_delete_by_record_id(file_data=file_data)
+        else:
+            raise ValueError(f"unexpected spec type in index description: {index_description}")
         self.upsert_batches_async(elements_dict=elements_dict)
 
 
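The stager now stamps every vector's metadata with RECORD_ID_LABEL set to file_data.identifier, and the uploader deletes any previously ingested vectors for that record before upserting, choosing the pod path (metadata-filtered delete) or the serverless path (query by filter, then delete by id) based on the index spec. A sketch of the pod-style delete using the pinecone client directly is shown below; "record_id" stands in for whatever RECORD_ID_LABEL resolves to, and the API key, index name, and identifier are placeholders:

# Sketch only: metadata-filtered delete against a pod-based Pinecone index.
from pinecone import Pinecone

pc = Pinecone(api_key="PINECONE_API_KEY")  # placeholder key
index = pc.Index(name="ingest-test")       # placeholder index name

# Remove vectors previously written for this record before re-upserting.
index.delete(filter={"record_id": {"$eq": "file-data-identifier"}})  # placeholder identifier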
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -21,7 +21,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -426,7 +425,7 @@ class SharepointDownloader(Downloader):
             f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
-    def run(self, file_data: FileData, **kwargs: Any) ->
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
         if not content_type:
             raise ValueError(
@@ -436,6 +435,8 @@ class SharepointDownloader(Downloader):
             return self.get_document(file_data=file_data)
         elif content_type == SharepointContentType.SITEPAGE.value:
             return self.get_site_page(file_data=file_data)
+        else:
+            raise ValueError(f"content type not recognized: {content_type}")
 
 
 sharepoint_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/slack.py

@@ -16,9 +16,9 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import (
     FileData,
@@ -161,7 +161,7 @@ class SlackDownloader(Downloader):
     def run(self, file_data, **kwargs):
         raise NotImplementedError
 
-    async def run_async(self, file_data: FileData, **kwargs) ->
+    async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None:
unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -98,20 +98,28 @@ class PostgresDownloader(SQLDownloader):
     download_config: PostgresDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["psycopg2"], extras="postgres")
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        from psycopg2 import sql
+
         table_name = file_data.additional_metadata["table_name"]
         id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+        ids = tuple(file_data.additional_metadata["ids"])
+
         with self.connection_config.get_cursor() as cursor:
-            fields =
-
+            fields = (
+                sql.SQL(",").join(sql.Identifier(field) for field in self.download_config.fields)
+                if self.download_config.fields
+                else sql.SQL("*")
+            )
+
+            query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
                 fields=fields,
-                table_name=table_name,
-                id_column=id_column,
-                ids=",".join([str(i) for i in ids]),
+                table_name=sql.Identifier(table_name),
+                id_column=sql.Identifier(id_column),
             )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            logger.debug(f"running query: {cursor.mogrify(query, (ids,))}")
+            cursor.execute(query, (ids,))
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
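The rewritten query_db composes the statement with psycopg2.sql identifiers and passes the ids as a bound parameter, so table and column names are safely quoted and the driver expands the tuple for the IN clause instead of relying on string interpolation. A minimal standalone sketch of the same pattern follows; the connection string, table, and column names are placeholders:

# Sketch only: composed identifiers plus a parameterized IN clause with psycopg2.
import psycopg2
from psycopg2 import sql

conn = psycopg2.connect("dbname=ingest user=postgres host=localhost")  # placeholder DSN
ids = (1, 2, 3)  # placeholder ids

with conn.cursor() as cursor:
    query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
        fields=sql.SQL(",").join([sql.Identifier("id"), sql.Identifier("text")]),
        table_name=sql.Identifier("elements"),
        id_column=sql.Identifier("id"),
    )
    cursor.execute(query, (ids,))  # psycopg2 adapts the tuple into a parenthesized list
    rows = cursor.fetchall()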