unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.

Files changed (78)
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
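
The headline changes in 0.3.0 are new v2 connectors (Kafka source/destination, Qdrant, Confluence, GitLab), a rename of the Azure Cognitive Search connector to Azure AI Search, and a new OneDrive destination. As a quick orientation, the sketch below is illustrative only: it assumes the wheel is installed with its kafka extra and that the registry entries expose the same attribute names they are constructed with; it imports the entries added by the new kafka/local.py module shown further down.

from unstructured_ingest.v2.processes.connectors.kafka.local import (
    kafka_local_destination_entry,
    kafka_local_source_entry,
)

# Each entry bundles a connection config with the classes that implement one side
# of a pipeline: indexer/downloader for a source, uploader for a destination.
print(kafka_local_source_entry.indexer)        # LocalKafkaIndexer
print(kafka_local_destination_entry.uploader)  # LocalKafkaUploader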
unstructured_ingest/v2/processes/connectors/kafka/kafka.py (new file)
@@ -0,0 +1,251 @@
+import json
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from time import time
+from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from confluent_kafka import Consumer, Producer
+
+
+class KafkaAccessConfig(AccessConfig, ABC):
+    pass
+
+
+class KafkaConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[KafkaAccessConfig]
+    bootstrap_server: str
+    port: int
+
+    @abstractmethod
+    def get_consumer_configuration(self) -> dict:
+        pass
+
+    @abstractmethod
+    def get_producer_configuration(self) -> dict:
+        pass
+
+    @contextmanager
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_consumer(self) -> ContextManager["Consumer"]:
+        from confluent_kafka import Consumer
+
+        consumer = Consumer(self.get_consumer_configuration())
+        try:
+            logger.debug("kafka consumer connected")
+            yield consumer
+        finally:
+            consumer.close()
+
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_producer(self) -> "Producer":
+        from confluent_kafka import Producer
+
+        producer = Producer(self.get_producer_configuration())
+        return producer
+
+
+class KafkaIndexerConfig(IndexerConfig):
+    topic: str = Field(description="which topic to consume from")
+    num_messages_to_consume: Optional[int] = 100
+    timeout: Optional[float] = Field(default=1.0, description="polling timeout")
+
+    def update_consumer(self, consumer: "Consumer") -> None:
+        consumer.subscribe([self.topic])
+
+
+@dataclass
+class KafkaIndexer(Indexer, ABC):
+    connection_config: KafkaConnectionConfig
+    index_config: KafkaIndexerConfig
+
+    @contextmanager
+    def get_consumer(self) -> ContextManager["Consumer"]:
+        with self.connection_config.get_consumer() as consumer:
+            self.index_config.update_consumer(consumer=consumer)
+            yield consumer
+
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def generate_messages(self) -> Generator[Any, None, None]:
+        from confluent_kafka import KafkaError, KafkaException
+
+        messages_consumed = 0
+        max_empty_polls = 10
+        empty_polls = 0
+        num_messages_to_consume = self.index_config.num_messages_to_consume
+        with self.get_consumer() as consumer:
+            while messages_consumed < num_messages_to_consume and empty_polls < max_empty_polls:
+                msg = consumer.poll(timeout=self.index_config.timeout)
+                if msg is None:
+                    logger.debug("No Kafka messages found")
+                    empty_polls += 1
+                    continue
+                if msg.error():
+                    if msg.error().code() == KafkaError._PARTITION_EOF:
+                        logger.info(
+                            "Reached end of partition for topic %s [%d] at offset %d"
+                            % (msg.topic(), msg.partition(), msg.offset())
+                        )
+                        break
+                    else:
+                        raise KafkaException(msg.error())
+                try:
+                    empty_polls = 0
+                    messages_consumed += 1
+                    yield msg
+                finally:
+                    consumer.commit(asynchronous=False)
+
+    def generate_file_data(self, msg) -> FileData:
+        msg_content = msg.value().decode("utf8")
+        identifier = f"{msg.topic()}_{msg.partition()}_{msg.offset()}"
+        additional_metadata = {
+            "topic": msg.topic(),
+            "partition": msg.partition(),
+            "offset": msg.offset(),
+            "content": msg_content,
+        }
+        filename = f"{identifier}.txt"
+        return FileData(
+            identifier=identifier,
+            connector_type=self.connector_type,
+            source_identifiers=SourceIdentifiers(
+                filename=filename,
+                fullpath=filename,
+            ),
+            metadata=FileDataSourceMetadata(
+                date_processed=str(time()),
+            ),
+            additional_metadata=additional_metadata,
+            display_name=filename,
+        )
+
+    def run(self) -> Generator[FileData, None, None]:
+        for message in self.generate_messages():
+            yield self.generate_file_data(message)
+
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        raise NotImplementedError()
+
+    def precheck(self):
+        try:
+            with self.get_consumer() as consumer:
+                cluster_meta = consumer.list_topics(timeout=self.index_config.timeout)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+                logger.info(f"successfully checked available topics: {current_topics}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+
+class KafkaDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class KafkaDownloader(Downloader, ABC):
+    connection_config: KafkaConnectionConfig
+    download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
+    version: Optional[str] = None
+    source_url: Optional[str] = None
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        source_identifiers = file_data.source_identifiers
+        if source_identifiers is None:
+            raise ValueError("FileData is missing source_identifiers")
+
+        # Build the download path using source_identifiers
+        download_path = Path(self.download_dir) / source_identifiers.relative_path
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            content = file_data.additional_metadata["content"]
+            with open(download_path, "w") as file:
+                file.write(content)
+        except Exception as e:
+            logger.error(f"Failed to download file {file_data.identifier}: {e}")
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class KafkaUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=100, description="Batch size")
+    topic: str = Field(description="which topic to write to")
+    timeout: Optional[float] = Field(
+        default=10.0, description="Timeout in seconds to flush batch of messages"
+    )
+
+
+@dataclass
+class KafkaUploader(Uploader, ABC):
+    connection_config: KafkaConnectionConfig
+    upload_config: KafkaUploaderConfig
+
+    def precheck(self):
+        try:
+            with self.connection_config.get_consumer() as consumer:
+                cluster_meta = consumer.list_topics(timeout=self.upload_config.timeout)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+                logger.info(f"successfully checked available topics: {current_topics}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")

+    def produce_batch(self, elements: list[dict]) -> None:
+        from confluent_kafka.error import KafkaException
+
+        producer = self.connection_config.get_producer()
+        failed_producer = False
+
+        def acked(err, msg):
+            if err is not None:
+                logger.error("Failed to deliver message: %s: %s" % (str(msg), str(err)))
+
+        for element in elements:
+            producer.produce(
+                topic=self.upload_config.topic,
+                value=json.dumps(element),
+                callback=acked,
+            )
+
+        producer.flush(timeout=self.upload_config.timeout)
+        if failed_producer:
+            raise KafkaException("failed to produce all messages in batch")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as elements_file:
+            elements = json.load(elements_file)
+        for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+            self.produce_batch(elements=element_batch)
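
For reference, the indexer above never touches disk: each consumed message becomes a FileData record whose identifier encodes the topic, partition, and offset, and whose payload rides along in additional_metadata["content"], which KafkaDownloader later writes out as a .txt file. A minimal illustration of that mapping, with invented example values:

# Invented values; mirrors KafkaIndexer.generate_file_data and KafkaDownloader.run above.
topic, partition, offset, payload = "docs", 0, 42, b"hello world"

identifier = f"{topic}_{partition}_{offset}"    # "docs_0_42"
filename = f"{identifier}.txt"                  # path the downloader writes under download_dir
additional_metadata = {
    "topic": topic,
    "partition": partition,
    "offset": offset,
    "content": payload.decode("utf8"),          # message body carried through to the downloader
}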
unstructured_ingest/v2/processes/connectors/kafka/local.py (new file)
@@ -0,0 +1,103 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
+    KafkaAccessConfig,
+    KafkaConnectionConfig,
+    KafkaDownloader,
+    KafkaDownloaderConfig,
+    KafkaIndexer,
+    KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
+)
+
+if TYPE_CHECKING:
+    pass
+
+CONNECTOR_TYPE = "kafka-local"
+
+
+class LocalKafkaAccessConfig(KafkaAccessConfig):
+    pass
+
+
+class LocalKafkaConnectionConfig(KafkaConnectionConfig):
+    access_config: Secret[LocalKafkaAccessConfig] = Field(
+        default=LocalKafkaAccessConfig(), validate_default=True
+    )
+
+    def get_consumer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "group.id": "default_group_id",
+            "enable.auto.commit": "false",
+            "auto.offset.reset": "earliest",
+        }
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+        }
+        return conf
+
+
+class LocalKafkaIndexerConfig(KafkaIndexerConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaIndexer(KafkaIndexer):
+    connection_config: LocalKafkaConnectionConfig
+    index_config: LocalKafkaIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class LocalKafkaDownloaderConfig(KafkaDownloaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaDownloader(KafkaDownloader):
+    connection_config: LocalKafkaConnectionConfig
+    download_config: LocalKafkaDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class LocalKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaUploader(KafkaUploader):
+    connection_config: LocalKafkaConnectionConfig
+    upload_config: LocalKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+kafka_local_source_entry = SourceRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    indexer=LocalKafkaIndexer,
+    indexer_config=LocalKafkaIndexerConfig,
+    downloader=LocalKafkaDownloader,
+    downloader_config=LocalKafkaDownloaderConfig,
+)
+
+kafka_local_destination_entry = DestinationRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    uploader=LocalKafkaUploader,
+    uploader_config=LocalKafkaUploaderConfig,
+)
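
Taken together, kafka.py provides the abstract machinery and local.py binds it to an unauthenticated broker. A minimal usage sketch follows; it is not taken from the package docs and assumes a broker reachable on localhost:29092, the confluent-kafka extra installed, and an existing topic named "ingest-topic":

from unstructured_ingest.v2.processes.connectors.kafka.local import (
    LocalKafkaConnectionConfig,
    LocalKafkaIndexer,
    LocalKafkaIndexerConfig,
)

# Assumed broker coordinates and topic name (not part of this diff).
connection_config = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=29092)
index_config = LocalKafkaIndexerConfig(topic="ingest-topic", num_messages_to_consume=10)

indexer = LocalKafkaIndexer(connection_config=connection_config, index_config=index_config)
indexer.precheck()               # lists topics to validate the connection
for file_data in indexer.run():  # yields one FileData per consumed message
    print(file_data.identifier)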
unstructured_ingest/v2/processes/connectors/onedrive.py
@@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
 from dateutil import parser
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -22,16 +26,19 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
+    Uploader,
+    UploaderConfig,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 
 if TYPE_CHECKING:
     from office365.graph_client import GraphClient
     from office365.onedrive.driveitems.driveItem import DriveItem
+    from office365.onedrive.drives.drive import Drive
 
 CONNECTOR_TYPE = "onedrive"
 MAX_MB_SIZE = 512_000_000
@@ -55,6 +62,11 @@ class OnedriveConnectionConfig(ConnectionConfig):
     )
     access_config: Secret[OnedriveAccessConfig]
 
+    def get_drive(self) -> "Drive":
+        client = self.get_client()
+        drive = client.users[self.user_pname].drive
+        return drive
+
     @requires_dependencies(["msal"], extras="onedrive")
     def get_token(self):
         from msal import ConfidentialClientApplication
@@ -100,7 +112,6 @@ class OnedriveIndexer(Indexer):
                 raise SourceConnectionError(
                     "{} ({})".format(error, token_resp.get("error_description"))
                 )
-            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -208,7 +219,7 @@ class OnedriveDownloader(Downloader):
         return self.download_dir / Path(rel_path)
 
     @SourceConnectionError.wrap
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         file = self._fetch_file(file_data=file_data)
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
@@ -221,7 +232,150 @@
         else:
             with download_path.open(mode="wb") as f:
                 file.download(f).execute_query()
-        return DownloadResponse(file_data=file_data, path=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class OnedriveUploaderConfig(UploaderConfig):
+    remote_url: str = Field(
+        description="URL of the destination in OneDrive, e.g., 'onedrive://Documents/Folder'"
+    )
+    prefix: str = "onedrive://"
+
+    @property
+    def root_folder(self) -> str:
+        url = (
+            self.remote_url.replace(self.prefix, "", 1)
+            if self.remote_url.startswith(self.prefix)
+            else self.remote_url
+        )
+        return url.split("/")[0]
+
+    @property
+    def url(self) -> str:
+        url = (
+            self.remote_url.replace(self.prefix, "", 1)
+            if self.remote_url.startswith(self.prefix)
+            else self.remote_url
+        )
+        return url
+
+
+@dataclass
+class OnedriveUploader(Uploader):
+    connection_config: OnedriveConnectionConfig
+    upload_config: OnedriveUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["office365"], extras="onedrive")
+    def precheck(self) -> None:
+        from office365.runtime.client_request_exception import ClientRequestException
+
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            drive = self.connection_config.get_drive()
+            root = drive.root
+            root_folder = self.upload_config.root_folder
+            folder = root.get_by_path(root_folder)
+            try:
+                folder.get().execute_query()
+            except ClientRequestException as e:
+                if e.message != "The resource could not be found.":
+                    raise e
+                folder = root.create_folder(root_folder).execute_query()
+                logger.info(f"successfully created folder: {folder.name}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        drive = self.connection_config.get_drive()
+
+        # Use the remote_url from upload_config as the base destination folder
+        base_destination_folder = self.upload_config.url
+
+        # Use the file's relative path to maintain directory structure, if needed
+        if file_data.source_identifiers and file_data.source_identifiers.rel_path:
+            # Combine the base destination folder with the file's relative path
+            destination_path = Path(base_destination_folder) / Path(
+                file_data.source_identifiers.rel_path
+            )
+        else:
+            # If no relative path is provided, upload directly to the base destination folder
+            destination_path = Path(base_destination_folder) / path.name
+
+        destination_folder = destination_path.parent
+        file_name = destination_path.name
+
+        # Convert destination folder to a string suitable for OneDrive API
+        destination_folder_str = str(destination_folder).replace("\\", "/")
+
+        # Resolve the destination folder in OneDrive, creating it if necessary
+        try:
+            # Attempt to get the folder
+            folder = drive.root.get_by_path(destination_folder_str)
+            folder.get().execute_query()
+        except Exception:
+            # Folder doesn't exist, create it recursively
+            current_folder = drive.root
+            for part in destination_folder.parts:
+                # Use filter to find the folder by name
+                folders = (
+                    current_folder.children.filter(f"name eq '{part}' and folder ne null")
+                    .get()
+                    .execute_query()
+                )
+                if folders:
+                    current_folder = folders[0]
+                else:
+                    # Folder doesn't exist, create it
+                    current_folder = current_folder.create_folder(part).execute_query()
+            folder = current_folder
+
+        # Check the size of the file
+        file_size = path.stat().st_size
+
+        if file_size < MAX_MB_SIZE:
+            # Use simple upload for small files
+            with path.open("rb") as local_file:
+                content = local_file.read()
+            logger.info(f"Uploading {path} to {destination_path} using simple upload")
+            try:
+                uploaded_file = folder.upload(file_name, content).execute_query()
+                if not uploaded_file or uploaded_file.name != file_name:
+                    raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
+                # Log details about the uploaded file
+                logger.info(
+                    f"Uploaded file '{uploaded_file.name}' with ID '{uploaded_file.id}'"
+                )
+            except Exception as e:
+                logger.error(f"Failed to upload file '{file_name}': {e}", exc_info=True)
+                raise DestinationConnectionError(
+                    f"Failed to upload file '{file_name}': {e}"
+                ) from e
+        else:
+            # Use resumable upload for large files
+            destination_fullpath = f"{destination_folder_str}/{file_name}"
+            destination_drive_item = drive.root.item_with_path(destination_fullpath)
+
+            logger.info(f"Uploading {path} to {destination_fullpath} using resumable upload")
+            try:
+                uploaded_file = destination_drive_item.resumable_upload(
+                    source_path=str(path)
+                ).execute_query()
+                # Validate the upload
+                if not uploaded_file or uploaded_file.name != file_name:
+                    raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
+                # Log details about the uploaded file
+                logger.info(f"Uploaded file {uploaded_file.name} with ID {uploaded_file.id}")
+            except Exception as e:
+                logger.error(f"Failed to upload file '{file_name}' using resumable upload: {e}")
+                raise DestinationConnectionError(
+                    f"Failed to upload file '{file_name}' using resumable upload: {e}"
+                ) from e
 
 
 onedrive_source_entry = SourceRegistryEntry(
@@ -231,3 +385,9 @@ onedrive_source_entry = SourceRegistryEntry(
     downloader_config=OnedriveDownloaderConfig,
     downloader=OnedriveDownloader,
 )
+
+onedrive_destination_entry = DestinationRegistryEntry(
+    connection_config=OnedriveConnectionConfig,
+    uploader=OnedriveUploader,
+    uploader_config=OnedriveUploaderConfig,
+)
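
The new OneDrive destination is addressed by a remote_url with an onedrive:// prefix; the config's root_folder and url properties simply strip that prefix, as this small sketch shows (the path is invented for illustration):

from unstructured_ingest.v2.processes.connectors.onedrive import OnedriveUploaderConfig

# Invented example path; the onedrive:// prefix is optional and stripped when present.
cfg = OnedriveUploaderConfig(remote_url="onedrive://Documents/ingest-output")
assert cfg.root_folder == "Documents"        # folder that precheck() verifies or creates
assert cfg.url == "Documents/ingest-output"  # base folder that run() uploads into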
unstructured_ingest/v2/processes/connectors/outlook.py
@@ -15,10 +15,10 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -191,7 +191,7 @@ class OutlookDownloader(Downloader):
     connection_config: OutlookConnectionConfig
     download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None: