unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +167 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +23 -0
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/validation.py +73 -22
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +24 -10
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/kafka/kafka.py (new file)

@@ -0,0 +1,251 @@
+import json
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from time import time
+from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from confluent_kafka import Consumer, Producer
+
+
+class KafkaAccessConfig(AccessConfig, ABC):
+    pass
+
+
+class KafkaConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[KafkaAccessConfig]
+    bootstrap_server: str
+    port: int
+
+    @abstractmethod
+    def get_consumer_configuration(self) -> dict:
+        pass
+
+    @abstractmethod
+    def get_producer_configuration(self) -> dict:
+        pass
+
+    @contextmanager
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_consumer(self) -> ContextManager["Consumer"]:
+        from confluent_kafka import Consumer
+
+        consumer = Consumer(self.get_consumer_configuration())
+        try:
+            logger.debug("kafka consumer connected")
+            yield consumer
+        finally:
+            consumer.close()
+
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_producer(self) -> "Producer":
+        from confluent_kafka import Producer
+
+        producer = Producer(self.get_producer_configuration())
+        return producer
+
+
+class KafkaIndexerConfig(IndexerConfig):
+    topic: str = Field(description="which topic to consume from")
+    num_messages_to_consume: Optional[int] = 100
+    timeout: Optional[float] = Field(default=1.0, description="polling timeout")
+
+    def update_consumer(self, consumer: "Consumer") -> None:
+        consumer.subscribe([self.topic])
+
+
+@dataclass
+class KafkaIndexer(Indexer, ABC):
+    connection_config: KafkaConnectionConfig
+    index_config: KafkaIndexerConfig
+
+    @contextmanager
+    def get_consumer(self) -> ContextManager["Consumer"]:
+        with self.connection_config.get_consumer() as consumer:
+            self.index_config.update_consumer(consumer=consumer)
+            yield consumer
+
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def generate_messages(self) -> Generator[Any, None, None]:
+        from confluent_kafka import KafkaError, KafkaException
+
+        messages_consumed = 0
+        max_empty_polls = 10
+        empty_polls = 0
+        num_messages_to_consume = self.index_config.num_messages_to_consume
+        with self.get_consumer() as consumer:
+            while messages_consumed < num_messages_to_consume and empty_polls < max_empty_polls:
+                msg = consumer.poll(timeout=self.index_config.timeout)
+                if msg is None:
+                    logger.debug("No Kafka messages found")
+                    empty_polls += 1
+                    continue
+                if msg.error():
+                    if msg.error().code() == KafkaError._PARTITION_EOF:
+                        logger.info(
+                            "Reached end of partition for topic %s [%d] at offset %d"
+                            % (msg.topic(), msg.partition(), msg.offset())
+                        )
+                        break
+                    else:
+                        raise KafkaException(msg.error())
+                try:
+                    empty_polls = 0
+                    messages_consumed += 1
+                    yield msg
+                finally:
+                    consumer.commit(asynchronous=False)
+
+    def generate_file_data(self, msg) -> FileData:
+        msg_content = msg.value().decode("utf8")
+        identifier = f"{msg.topic()}_{msg.partition()}_{msg.offset()}"
+        additional_metadata = {
+            "topic": msg.topic(),
+            "partition": msg.partition(),
+            "offset": msg.offset(),
+            "content": msg_content,
+        }
+        filename = f"{identifier}.txt"
+        return FileData(
+            identifier=identifier,
+            connector_type=self.connector_type,
+            source_identifiers=SourceIdentifiers(
+                filename=filename,
+                fullpath=filename,
+            ),
+            metadata=FileDataSourceMetadata(
+                date_processed=str(time()),
+            ),
+            additional_metadata=additional_metadata,
+            display_name=filename,
+        )
+
+    def run(self) -> Generator[FileData, None, None]:
+        for message in self.generate_messages():
+            yield self.generate_file_data(message)
+
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        raise NotImplementedError()
+
+    def precheck(self):
+        try:
+            with self.get_consumer() as consumer:
+                cluster_meta = consumer.list_topics(timeout=self.index_config.timeout)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+            logger.info(f"successfully checked available topics: {current_topics}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+
+class KafkaDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class KafkaDownloader(Downloader, ABC):
+    connection_config: KafkaConnectionConfig
+    download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
+    version: Optional[str] = None
+    source_url: Optional[str] = None
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        source_identifiers = file_data.source_identifiers
+        if source_identifiers is None:
+            raise ValueError("FileData is missing source_identifiers")
+
+        # Build the download path using source_identifiers
+        download_path = Path(self.download_dir) / source_identifiers.relative_path
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            content = file_data.additional_metadata["content"]
+            with open(download_path, "w") as file:
+                file.write(content)
+        except Exception as e:
+            logger.error(f"Failed to download file {file_data.identifier}: {e}")
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class KafkaUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=100, description="Batch size")
+    topic: str = Field(description="which topic to write to")
+    timeout: Optional[float] = Field(
+        default=10.0, description="Timeout in seconds to flush batch of messages"
+    )
+
+
+@dataclass
+class KafkaUploader(Uploader, ABC):
+    connection_config: KafkaConnectionConfig
+    upload_config: KafkaUploaderConfig
+
+    def precheck(self):
+        try:
+            with self.connection_config.get_consumer() as consumer:
+                cluster_meta = consumer.list_topics(timeout=self.upload_config.timeout)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+            logger.info(f"successfully checked available topics: {current_topics}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def produce_batch(self, elements: list[dict]) -> None:
+        from confluent_kafka.error import KafkaException
+
+        producer = self.connection_config.get_producer()
+        failed_producer = False
+
+        def acked(err, msg):
+            if err is not None:
+                logger.error("Failed to deliver message: %s: %s" % (str(msg), str(err)))
+
+        for element in elements:
+            producer.produce(
+                topic=self.upload_config.topic,
+                value=json.dumps(element),
+                callback=acked,
+            )
+
+        producer.flush(timeout=self.upload_config.timeout)
+        if failed_producer:
+            raise KafkaException("failed to produce all messages in batch")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as elements_file:
+            elements = json.load(elements_file)
+        for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+            self.produce_batch(elements=element_batch)
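For orientation: the new KafkaIndexer carries each message's payload forward inside FileData.additional_metadata, so KafkaDownloader only has to write that content to disk instead of re-contacting the broker. A minimal sketch of the record produced for one consumed message, using hypothetical topic/partition/offset/payload values not taken from the diff:

from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata, SourceIdentifiers

# Hypothetical message: topic="ingest-topic", partition=0, offset=42, value=b"hello world"
example = FileData(
    identifier="ingest-topic_0_42",
    connector_type="kafka-local",
    source_identifiers=SourceIdentifiers(
        filename="ingest-topic_0_42.txt",
        fullpath="ingest-topic_0_42.txt",
    ),
    metadata=FileDataSourceMetadata(date_processed="1730000000.0"),  # str(time()) at index time
    additional_metadata={
        "topic": "ingest-topic",
        "partition": 0,
        "offset": 42,
        "content": "hello world",  # written verbatim to disk by KafkaDownloader.run()
    },
    display_name="ingest-topic_0_42.txt",
)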
unstructured_ingest/v2/processes/connectors/kafka/local.py (new file)

@@ -0,0 +1,103 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+    KafkaAccessConfig,
+    KafkaConnectionConfig,
+    KafkaDownloader,
+    KafkaDownloaderConfig,
+    KafkaIndexer,
+    KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
+)
+
+if TYPE_CHECKING:
+    pass
+
+CONNECTOR_TYPE = "kafka-local"
+
+
+class LocalKafkaAccessConfig(KafkaAccessConfig):
+    pass
+
+
+class LocalKafkaConnectionConfig(KafkaConnectionConfig):
+    access_config: Secret[LocalKafkaAccessConfig] = Field(
+        default=LocalKafkaAccessConfig(), validate_default=True
+    )
+
+    def get_consumer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "group.id": "default_group_id",
+            "enable.auto.commit": "false",
+            "auto.offset.reset": "earliest",
+        }
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+        }
+        return conf
+
+
+class LocalKafkaIndexerConfig(KafkaIndexerConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaIndexer(KafkaIndexer):
+    connection_config: LocalKafkaConnectionConfig
+    index_config: LocalKafkaIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class LocalKafkaDownloaderConfig(KafkaDownloaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaDownloader(KafkaDownloader):
+    connection_config: LocalKafkaConnectionConfig
+    download_config: LocalKafkaDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class LocalKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaUploader(KafkaUploader):
+    connection_config: LocalKafkaConnectionConfig
+    upload_config: LocalKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+kafka_local_source_entry = SourceRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    indexer=LocalKafkaIndexer,
+    indexer_config=LocalKafkaIndexerConfig,
+    downloader=LocalKafkaDownloader,
+    downloader_config=LocalKafkaDownloaderConfig,
+)
+
+kafka_local_destination_entry = DestinationRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    uploader=LocalKafkaUploader,
+    uploader_config=LocalKafkaUploaderConfig,
+)
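The local variant above wires the abstract Kafka classes into the v2 connector registry. A rough usage sketch, not taken from the project's docs; the broker address, port, and topic name are placeholders:

from unstructured_ingest.v2.processes.connectors.kafka.local import (
    LocalKafkaConnectionConfig,
    LocalKafkaIndexer,
    LocalKafkaIndexerConfig,
)

# LocalKafkaConnectionConfig builds "bootstrap.servers" as f"{bootstrap_server}:{port}"
# with auto-commit disabled and offset reset to "earliest".
connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)

indexer = LocalKafkaIndexer(
    connection_config=connection_config,
    index_config=LocalKafkaIndexerConfig(topic="ingest-topic", num_messages_to_consume=10),
)

indexer.precheck()  # lists available topics to validate the connection
for file_data in indexer.run():  # yields one FileData per consumed message
    print(file_data.identifier, file_data.additional_metadata["offset"])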
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
 from dateutil import parser
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -22,16 +26,19 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-
+    Uploader,
+    UploaderConfig,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 
 if TYPE_CHECKING:
     from office365.graph_client import GraphClient
     from office365.onedrive.driveitems.driveItem import DriveItem
+    from office365.onedrive.drives.drive import Drive
 
 CONNECTOR_TYPE = "onedrive"
 MAX_MB_SIZE = 512_000_000
@@ -55,6 +62,11 @@ class OnedriveConnectionConfig(ConnectionConfig):
     )
     access_config: Secret[OnedriveAccessConfig]
 
+    def get_drive(self) -> "Drive":
+        client = self.get_client()
+        drive = client.users[self.user_pname].drive
+        return drive
+
     @requires_dependencies(["msal"], extras="onedrive")
     def get_token(self):
         from msal import ConfidentialClientApplication
@@ -100,7 +112,6 @@ class OnedriveIndexer(Indexer):
                 raise SourceConnectionError(
                     "{} ({})".format(error, token_resp.get("error_description"))
                 )
-            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -208,7 +219,7 @@ class OnedriveDownloader(Downloader):
         return self.download_dir / Path(rel_path)
 
     @SourceConnectionError.wrap
-    def run(self, file_data: FileData, **kwargs: Any) ->
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         file = self._fetch_file(file_data=file_data)
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
@@ -221,7 +232,150 @@ class OnedriveDownloader(Downloader):
         else:
             with download_path.open(mode="wb") as f:
                 file.download(f).execute_query()
-        return
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class OnedriveUploaderConfig(UploaderConfig):
+    remote_url: str = Field(
+        description="URL of the destination in OneDrive, e.g., 'onedrive://Documents/Folder'"
+    )
+    prefix: str = "onedrive://"
+
+    @property
+    def root_folder(self) -> str:
+        url = (
+            self.remote_url.replace(self.prefix, "", 1)
+            if self.remote_url.startswith(self.prefix)
+            else self.remote_url
+        )
+        return url.split("/")[0]
+
+    @property
+    def url(self) -> str:
+        url = (
+            self.remote_url.replace(self.prefix, "", 1)
+            if self.remote_url.startswith(self.prefix)
+            else self.remote_url
+        )
+        return url
+
+
+@dataclass
+class OnedriveUploader(Uploader):
+    connection_config: OnedriveConnectionConfig
+    upload_config: OnedriveUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["office365"], extras="onedrive")
+    def precheck(self) -> None:
+        from office365.runtime.client_request_exception import ClientRequestException
+
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            drive = self.connection_config.get_drive()
+            root = drive.root
+            root_folder = self.upload_config.root_folder
+            folder = root.get_by_path(root_folder)
+            try:
+                folder.get().execute_query()
+            except ClientRequestException as e:
+                if e.message != "The resource could not be found.":
+                    raise e
+                folder = root.create_folder(root_folder).execute_query()
+                logger.info(f"successfully created folder: {folder.name}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        drive = self.connection_config.get_drive()
+
+        # Use the remote_url from upload_config as the base destination folder
+        base_destination_folder = self.upload_config.url
+
+        # Use the file's relative path to maintain directory structure, if needed
+        if file_data.source_identifiers and file_data.source_identifiers.rel_path:
+            # Combine the base destination folder with the file's relative path
+            destination_path = Path(base_destination_folder) / Path(
+                file_data.source_identifiers.rel_path
+            )
+        else:
+            # If no relative path is provided, upload directly to the base destination folder
+            destination_path = Path(base_destination_folder) / path.name
+
+        destination_folder = destination_path.parent
+        file_name = destination_path.name
+
+        # Convert destination folder to a string suitable for OneDrive API
+        destination_folder_str = str(destination_folder).replace("\\", "/")
+
+        # Resolve the destination folder in OneDrive, creating it if necessary
+        try:
+            # Attempt to get the folder
+            folder = drive.root.get_by_path(destination_folder_str)
+            folder.get().execute_query()
+        except Exception:
+            # Folder doesn't exist, create it recursively
+            current_folder = drive.root
+            for part in destination_folder.parts:
+                # Use filter to find the folder by name
+                folders = (
+                    current_folder.children.filter(f"name eq '{part}' and folder ne null")
+                    .get()
+                    .execute_query()
+                )
+                if folders:
+                    current_folder = folders[0]
+                else:
+                    # Folder doesn't exist, create it
+                    current_folder = current_folder.create_folder(part).execute_query()
+            folder = current_folder
+
+        # Check the size of the file
+        file_size = path.stat().st_size
+
+        if file_size < MAX_MB_SIZE:
+            # Use simple upload for small files
+            with path.open("rb") as local_file:
+                content = local_file.read()
+            logger.info(f"Uploading {path} to {destination_path} using simple upload")
+            try:
+                uploaded_file = folder.upload(file_name, content).execute_query()
+                if not uploaded_file or uploaded_file.name != file_name:
+                    raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
+                # Log details about the uploaded file
+                logger.info(
+                    f"Uploaded file '{uploaded_file.name}' with ID '{uploaded_file.id}'"
+                )
+            except Exception as e:
+                logger.error(f"Failed to upload file '{file_name}': {e}", exc_info=True)
+                raise DestinationConnectionError(
+                    f"Failed to upload file '{file_name}': {e}"
+                ) from e
+        else:
+            # Use resumable upload for large files
+            destination_fullpath = f"{destination_folder_str}/{file_name}"
+            destination_drive_item = drive.root.item_with_path(destination_fullpath)
+
+            logger.info(f"Uploading {path} to {destination_fullpath} using resumable upload")
+            try:
+                uploaded_file = destination_drive_item.resumable_upload(
+                    source_path=str(path)
+                ).execute_query()
+                # Validate the upload
+                if not uploaded_file or uploaded_file.name != file_name:
+                    raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
+                # Log details about the uploaded file
+                logger.info(f"Uploaded file {uploaded_file.name} with ID {uploaded_file.id}")
+            except Exception as e:
+                logger.error(f"Failed to upload file '{file_name}' using resumable upload: {e}")
+                raise DestinationConnectionError(
+                    f"Failed to upload file '{file_name}' using resumable upload: {e}"
+                ) from e
 
 
 onedrive_source_entry = SourceRegistryEntry(
@@ -231,3 +385,9 @@ onedrive_source_entry = SourceRegistryEntry(
     downloader_config=OnedriveDownloaderConfig,
     downloader=OnedriveDownloader,
 )
+
+onedrive_destination_entry = DestinationRegistryEntry(
+    connection_config=OnedriveConnectionConfig,
+    uploader=OnedriveUploader,
+    uploader_config=OnedriveUploaderConfig,
+)
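With this change OneDrive can also act as a destination. A hedged sketch of driving the new uploader, assuming an already-populated OnedriveConnectionConfig (its credential fields are outside this diff) and a FileData produced by an upstream step; the folder path and file path are placeholders:

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.onedrive import (
    OnedriveUploader,
    OnedriveUploaderConfig,
)

# The "onedrive://" prefix is stripped; the first path segment ("Documents") is the
# root_folder that precheck() creates if it does not already exist.
upload_config = OnedriveUploaderConfig(remote_url="onedrive://Documents/ingest-output")

uploader = OnedriveUploader(
    connection_config=connection_config,  # assumed: a configured OnedriveConnectionConfig
    upload_config=upload_config,
)
uploader.precheck()

# Files under MAX_MB_SIZE go through folder.upload(); larger ones use resumable_upload().
uploader.run(path=Path("output/elements.json"), file_data=file_data)  # file_data: assumed upstream FileData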
unstructured_ingest/v2/processes/connectors/outlook.py

@@ -15,10 +15,10 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -191,7 +191,7 @@ class OutlookDownloader(Downloader):
     connection_config: OutlookConnectionConfig
     download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
 
-    def run(self, file_data: FileData, **kwargs: Any) ->
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None: