unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from time import time
|
|
8
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
|
|
9
|
+
|
|
10
|
+
from dateutil import parser
|
|
11
|
+
from pydantic import Field, Secret
|
|
12
|
+
|
|
13
|
+
from unstructured_ingest.data_types.file_data import (
|
|
14
|
+
FileData,
|
|
15
|
+
FileDataSourceMetadata,
|
|
16
|
+
SourceIdentifiers,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.error import (
|
|
19
|
+
DestinationConnectionError,
|
|
20
|
+
SourceConnectionError,
|
|
21
|
+
SourceConnectionNetworkError,
|
|
22
|
+
UserAuthError,
|
|
23
|
+
ValueError,
|
|
24
|
+
)
|
|
25
|
+
from unstructured_ingest.interfaces import (
|
|
26
|
+
AccessConfig,
|
|
27
|
+
ConnectionConfig,
|
|
28
|
+
Downloader,
|
|
29
|
+
DownloaderConfig,
|
|
30
|
+
DownloadResponse,
|
|
31
|
+
Indexer,
|
|
32
|
+
IndexerConfig,
|
|
33
|
+
Uploader,
|
|
34
|
+
UploaderConfig,
|
|
35
|
+
)
|
|
36
|
+
from unstructured_ingest.logger import logger
|
|
37
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
38
|
+
DestinationRegistryEntry,
|
|
39
|
+
SourceRegistryEntry,
|
|
40
|
+
)
|
|
41
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
42
|
+
BlobStoreUploadStager,
|
|
43
|
+
BlobStoreUploadStagerConfig,
|
|
44
|
+
)
|
|
45
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
46
|
+
|
|
47
|
+
if TYPE_CHECKING:
|
|
48
|
+
from office365.graph_client import GraphClient
|
|
49
|
+
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
50
|
+
from office365.onedrive.drives.drive import Drive
|
|
51
|
+
|
|
52
|
+
# Identifier used as the default `connector_type` of the indexer, downloader
# and uploader classes in this module.
CONNECTOR_TYPE = "onedrive"

# Size threshold in bytes. OnedriveDownloader.run compares the remote file's
# "size" property against it and requests an explicit chunk size above it.
MAX_BYTES_SIZE = 512_000_000
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class OnedriveAccessConfig(AccessConfig):
    # Secret values used to obtain a Microsoft Graph token.
    # `client_cred` is required. `password` is optional; when it is set,
    # OnedriveConnectionConfig.get_token uses the username/password grant
    # instead of the client-credential grant.
    client_cred: str = Field(description="Microsoft App client secret")
    password: Optional[str] = Field(default=None, description="Service account password")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class OnedriveConnectionConfig(ConnectionConfig):
    # Connection settings for a single user's OneDrive, plus helpers that turn
    # those settings into a token, a Graph client and a drive handle.
    client_id: str = Field(description="Microsoft app client ID")
    user_pname: str = Field(
        description="User principal name or service account, usually your Azure AD email."
    )
    tenant: str = Field(
        repr=False, description="ID or domain name associated with your Azure AD instance"
    )
    authority_url: Optional[str] = Field(
        repr=False,
        default="https://login.microsoftonline.com",
        examples=["https://login.microsoftonline.com"],
        description="Authentication token provider for Microsoft apps",
    )
    access_config: Secret[OnedriveAccessConfig]

    def get_drive(self) -> "Drive":
        """Return the drive object of the user identified by ``user_pname``."""
        client = self.get_client()
        return client.users[self.user_pname].drive

    @requires_dependencies(["msal", "requests"], extras="onedrive")
    def get_token(self):
        """Acquire a Microsoft Graph token for the configured application.

        Two flows are supported, selected by the access config:

        * ``password`` set: a ``grant_type=password`` request is posted to the
          authority's ``oauth2/v2.0/token`` endpoint and the decoded JSON body
          is returned.
        * otherwise: a client-credential token is requested through
          ``msal.ConfidentialClientApplication`` and the resulting dict is
          returned.

        Both flows address the configured ``authority_url`` (falling back to the
        public Microsoft login endpoint when it is unset), so a non-default
        authority is honoured regardless of the flow.

        Raises:
            SourceConnectionError: the password flow got a non-200 response.
            UserAuthError: the client-credential response carries an error that
                is classified as a credential/tenant problem.
            SourceConnectionNetworkError: the client-credential response carries
                any other error.
        """
        from msal import ConfidentialClientApplication
        from requests import post

        secrets = self.access_config.get_secret_value()
        # `authority_url` is Optional; fall back to the field's default and drop
        # any trailing slash so the joined URL never contains "//" or "None".
        authority = (self.authority_url or "https://login.microsoftonline.com").rstrip("/")

        if secrets.password:
            url = f"{authority}/{self.tenant}/oauth2/v2.0/token"
            headers = {"Content-Type": "application/x-www-form-urlencoded"}
            data = {
                "grant_type": "password",
                "username": self.user_pname,
                "password": secrets.password,
                "client_id": self.client_id,
                "client_secret": secrets.client_cred,
                "scope": "https://graph.microsoft.com/.default",
            }
            response = post(url, headers=headers, data=data)
            if response.status_code == 200:
                return response.json()
            raise SourceConnectionError(
                f"Oauth2 authentication failed with {response.status_code}: {response.text}"
            )

        try:
            app = ConfidentialClientApplication(
                authority=f"{authority}/{self.tenant}",
                client_id=self.client_id,
                client_credential=secrets.client_cred,
            )
            token = app.acquire_token_for_client(
                scopes=["https://graph.microsoft.com/.default"]
            )
        # NOTE(review): `ValueError` is the name imported from
        # unstructured_ingest.error at the top of this file - confirm it matches
        # what msal raises for malformed credentials.
        except ValueError:
            logger.error("Couldn't set up credentials.")
            raise

        if "error" in token:
            error_codes = token.get("error_codes", [])
            error_type = token.get("error", "")
            error_description = token.get("error_description", "")

            # 7000215: Invalid client secret provided
            # 7000218: Invalid client id provided
            # 700016: Application not found in directory
            # 90002: Tenant not found
            auth_error_codes = [7000215, 7000218, 700016, 90002]

            if any(code in error_codes for code in auth_error_codes) or error_type in [
                "invalid_client",
                "unauthorized_client",
                "invalid_grant",
            ]:
                raise UserAuthError(f"Authentication failed: {error_type}: {error_description}")
            raise SourceConnectionNetworkError(
                f"Failed to fetch token: {error_type}: {error_description}"
            )
        return token

    @requires_dependencies(["office365"], extras="onedrive")
    def get_client(self) -> "GraphClient":
        """Build a ``GraphClient`` that calls ``get_token`` to obtain its token."""
        from office365.graph_client import GraphClient

        # The bound method itself is handed over, not a token value.
        return GraphClient(self.get_token)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class OnedriveIndexerConfig(IndexerConfig):
    # path: folder (relative to the drive root) to start listing from. A falsy
    #   value makes OnedriveIndexer.get_root_sync use the drive root itself.
    # recursive: when True, OnedriveIndexer.list_objects_sync also descends
    #   into sub-folders.
    path: Optional[str] = ""
    recursive: bool = Field(default=False)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@dataclass
class OnedriveIndexer(Indexer):
    """List the files of a OneDrive folder and describe each one as ``FileData``.

    The blocking Graph client calls live in the ``*_sync`` methods; the async
    counterparts run them through ``asyncio.to_thread``.
    """

    connection_config: OnedriveConnectionConfig
    index_config: OnedriveIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Validate the connection by requesting a token.

        Raises:
            SourceConnectionError: the token request failed or the response
                contains an ``error`` entry.
        """
        try:
            token_resp: dict = self.connection_config.get_token()
            if error := token_resp.get("error"):
                raise SourceConnectionError(
                    "{} ({})".format(error, token_resp.get("error_description"))
                )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Keep the original exception attached as the cause.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def list_objects_sync(self, folder: "DriveItem", recursive: bool) -> list["DriveItem"]:
        """Return the file items under ``folder``, descending when ``recursive``."""
        drive_items = folder.children.get().execute_query()
        files = [d for d in drive_items if d.is_file]
        if not recursive:
            return files

        for sub_folder in (d for d in drive_items if d.is_folder):
            files.extend(self.list_objects_sync(sub_folder, recursive))
        return files

    async def list_objects(self, folder: "DriveItem", recursive: bool) -> list["DriveItem"]:
        """Async wrapper around ``list_objects_sync``."""
        return await asyncio.to_thread(self.list_objects_sync, folder, recursive)

    def get_root_sync(self, client: "GraphClient") -> "DriveItem":
        """Resolve the item listing starts from.

        This is the drive root, or the item at ``index_config.path`` when a path
        is configured.

        Raises:
            ValueError: the resolved item is missing or is not a folder.
        """
        root = client.users[self.connection_config.user_pname].drive.get().execute_query().root
        if fpath := self.index_config.path:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")
        return root

    async def get_root(self, client: "GraphClient") -> "DriveItem":
        """Async wrapper around ``get_root_sync``."""
        return await asyncio.to_thread(self.get_root_sync, client)

    def get_properties_sync(self, drive_item: "DriveItem") -> dict:
        """Return the item's properties whose values ``json.dumps`` accepts."""
        filtered_properties = {}
        for key, value in drive_item.properties.items():
            try:
                json.dumps(value)
            except TypeError:
                # Deliberately dropped: the value is not JSON serializable.
                continue
            filtered_properties[key] = value
        return filtered_properties

    async def get_properties(self, drive_item: "DriveItem") -> dict:
        """Async wrapper around ``get_properties_sync``."""
        return await asyncio.to_thread(self.get_properties_sync, drive_item)

    def _rel_path_from_index_root(self, server_path: str) -> str:
        """Return ``server_path`` relative to the configured index folder.

        Only a leading ``<index path>/`` is removed, and only on a whole path
        segment. A plain ``str.replace`` would also delete later occurrences of
        the folder name and partial matches inside other names. An unset or
        empty index path leaves the path untouched apart from leading slashes.
        """
        index_root = (self.index_config.path or "").strip("/")
        candidate = server_path.lstrip("/")
        if index_root and candidate.startswith(index_root + "/"):
            return candidate[len(index_root) + 1 :]
        return candidate

    def drive_item_to_file_data_sync(self, drive_item: "DriveItem") -> FileData:
        """Build the ``FileData`` record describing ``drive_item``."""
        # Everything after the last ":" of the parent reference is used as the
        # folder path; one leading "/" is removed from it.
        file_path = drive_item.parent_reference.path.split(":")[-1]
        file_path = file_path[1:] if file_path and file_path[0] == "/" else file_path
        filename = drive_item.name
        server_path = file_path + "/" + filename
        rel_path = self._rel_path_from_index_root(server_path)

        modified_raw = drive_item.last_modified_datetime
        created_raw = drive_item.created_datetime
        date_modified_dt = parser.parse(str(modified_raw)) if modified_raw else None
        date_created_at = parser.parse(str(created_raw)) if created_raw else None

        return FileData(
            identifier=drive_item.id,
            connector_type=self.connector_type,
            source_identifiers=SourceIdentifiers(
                fullpath=server_path, filename=drive_item.name, rel_path=rel_path
            ),
            metadata=FileDataSourceMetadata(
                url=drive_item.parent_reference.path + "/" + drive_item.name,
                version=drive_item.etag,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
                date_created=str(date_created_at.timestamp()) if date_created_at else None,
                date_processed=str(time()),
                record_locator={
                    "user_pname": self.connection_config.user_pname,
                    "server_relative_path": server_path,
                },
            ),
            additional_metadata=self.get_properties_sync(drive_item=drive_item),
            display_name=server_path,
        )

    async def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData:
        """Async wrapper around ``drive_item_to_file_data_sync``."""
        # Offload the file data creation if it's not guaranteed async
        return await asyncio.to_thread(self.drive_item_to_file_data_sync, drive_item)

    def is_async(self) -> bool:
        """Report that this indexer is driven through ``run_async``."""
        return True

    async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
        """Yield one ``FileData`` per file found under the configured folder.

        Raises:
            SourceConnectionError: the token response contains an ``error`` key.
        """
        token_resp = await asyncio.to_thread(self.connection_config.get_token)
        if "error" in token_resp:
            raise SourceConnectionError(
                f"[{self.connector_type}]: {token_resp['error']} "
                f"({token_resp.get('error_description')})"
            )

        client = await asyncio.to_thread(self.connection_config.get_client)
        root = await self.get_root(client=client)
        drive_items = await self.list_objects(folder=root, recursive=self.index_config.recursive)

        for drive_item in drive_items:
            file_data = await self.drive_item_to_file_data(drive_item=drive_item)
            yield file_data
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class OnedriveDownloaderConfig(DownloaderConfig):
    # Adds no OneDrive-specific fields; it only gives the downloader its own
    # config type.
    pass
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
@dataclass
class OnedriveDownloader(Downloader):
    """Download a single OneDrive file described by a ``FileData`` record."""

    connection_config: OnedriveConnectionConfig
    download_config: OnedriveDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    @SourceConnectionNetworkError.wrap
    def _fetch_file(self, file_data: FileData) -> DriveItem:
        """Look up the drive item at ``file_data.source_identifiers.fullpath``.

        Raises:
            ValueError: the record has no source identifiers or no full path.
            FileNotFoundError: the lookup returned a falsy item.
        """
        identifiers = file_data.source_identifiers
        if identifiers is None or not identifiers.fullpath:
            raise ValueError(
                f"file data doesn't have enough information to get "
                f"file content: {file_data.model_dump()}"
            )

        server_relative_path = identifiers.fullpath
        client = self.connection_config.get_client()
        drive = client.users[self.connection_config.user_pname].drive
        root = drive.get().execute_query().root
        file = root.get_by_path(server_relative_path).get().execute_query()
        if file:
            return file
        raise FileNotFoundError(f"file not found: {server_relative_path}")

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local target path: ``download_dir`` plus the relative path."""
        rel_path = file_data.source_identifiers.relative_path
        if rel_path.startswith("/"):
            # Drop one leading slash so the join stays below download_dir.
            rel_path = rel_path[1:]
        return self.download_dir / Path(rel_path)

    @SourceConnectionError.wrap
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Fetch the file, write it to the download path and return the response.

        Any exception is logged with its traceback and then re-raised.
        """
        try:
            file = self._fetch_file(file_data=file_data)
            fsize = file.get_property("size", 0)
            download_path = self.get_download_path(file_data=file_data)
            download_path.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")

            # Files above the threshold get an explicit 100 MiB chunk size;
            # smaller ones use the download session's default.
            session_options = {}
            if fsize > MAX_BYTES_SIZE:
                logger.info(f"downloading file with size: {fsize} bytes in chunks")
                session_options["chunk_size"] = 1024 * 1024 * 100

            with download_path.open(mode="wb") as f:
                file.download_session(f, **session_options).execute_query()
            return self.generate_download_response(file_data=file_data, download_path=download_path)
        except Exception as e:
            logger.error(
                f"[{self.connector_type}] Exception during downloading: {e}", exc_info=True
            )
            # Re-raise to see full stack trace locally
            raise
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
class OnedriveUploaderConfig(UploaderConfig):
    # remote_url: destination folder, optionally written with the `prefix` scheme.
    # prefix: scheme marker removed from the front of `remote_url` when present.
    remote_url: str = Field(
        description="URL of the destination in OneDrive, e.g., 'onedrive://Documents/Folder'"
    )
    prefix: str = "onedrive://"

    @property
    def root_folder(self) -> str:
        """First "/"-separated segment of the destination path."""
        return self.url.split("/")[0]

    @property
    def url(self) -> str:
        """``remote_url`` without a leading ``prefix``; unchanged when it has none."""
        if not self.remote_url.startswith(self.prefix):
            return self.remote_url
        # The prefix sits at index 0, so slicing it off removes exactly that
        # first occurrence.
        return self.remote_url[len(self.prefix) :]
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
@dataclass
class OnedriveUploader(Uploader):
    """Uploader that writes staged files into a OneDrive drive.

    The destination is `upload_config.url` (the remote URL without its
    `onedrive://` prefix). When the file carries a relative path, that path is
    appended so the source directory structure is preserved. Every uploaded
    file gets a `.json` suffix.
    """

    connection_config: OnedriveConnectionConfig
    upload_config: OnedriveUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["office365"], extras="onedrive")
    def precheck(self) -> None:
        """Validate credentials and make sure the root destination folder exists.

        Fetches a token, then looks up `upload_config.root_folder` under the
        drive root, creating it when the lookup returns 404.

        Raises:
            SourceConnectionError: if the token response carries an error or any
                step of the validation fails.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        # NOTE(review): this is a destination connector but raises
        # SourceConnectionError; kept as-is because callers may catch that type.
        try:
            token_resp: dict = self.connection_config.get_token()
            if error := token_resp.get("error"):
                raise SourceConnectionError(
                    "{} ({})".format(error, token_resp.get("error_description"))
                )
            drive = self.connection_config.get_drive()
            root = drive.root
            root_folder = self.upload_config.root_folder
            folder = root.get_by_path(root_folder)
            try:
                folder.get().execute_query()
            except ClientRequestException as e:
                if e.response.status_code != 404:
                    # Bare raise keeps the original traceback intact.
                    raise
                folder = root.create_folder(root_folder).execute_query()
                logger.info(f"successfully created folder: {folder.name}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _resolve_destination_folder(self, drive: Any, folder_path: str) -> Any:
        """Return the drive item for `folder_path`, creating missing folders.

        The full path is looked up first, so an existing destination costs a
        single request. Only when that lookup returns 404 is the path walked
        segment by segment, creating each folder that does not exist yet.

        Args:
            drive: drive object returned by `connection_config.get_drive()`.
            folder_path: '/'-separated folder path relative to the drive root.

        Returns:
            The drive item representing the (possibly newly created) folder.

        Raises:
            ClientRequestException: for any lookup failure other than 404.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        folder = drive.root.get_by_path(folder_path)
        try:
            folder.get().execute_query()
            return folder
        except ClientRequestException as e:
            if e.response.status_code != 404:
                raise

        # At least one segment is missing: walk down from the drive root.
        parent = drive.root
        walked = []
        for segment in folder_path.split("/"):
            # Skip empty segments (leading/duplicate slashes) and the "."
            # that pathlib yields for an empty parent.
            if segment in ("", "."):
                continue
            walked.append(segment)
            candidate = drive.root.get_by_path("/".join(walked))
            try:
                candidate.get().execute_query()
            except ClientRequestException as e:
                if e.response.status_code != 404:
                    raise
                candidate = parent.create_folder(segment).execute_query()
                logger.info(f"successfully created folder: {candidate.name}")
            parent = candidate
        return parent

    @requires_dependencies(["office365"], extras="onedrive")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the local file at `path` to OneDrive.

        Files smaller than `MAX_BYTES_SIZE` are read into memory and sent with a
        simple upload. Larger files use a resumable upload and are then renamed
        to the target file name, replacing any existing file of that name.

        Args:
            path: local file to upload.
            file_data: metadata for the file; its relative path, when present,
                determines the destination sub-path.
            **kwargs: unused.

        Raises:
            DestinationConnectionError: if the upload fails or the uploaded
                file's name does not match the expected name.
        """
        from office365.onedrive.driveitems.conflict_behavior import ConflictBehavior

        drive = self.connection_config.get_drive()

        # Use the remote_url from upload_config as the base destination folder
        base_destination_folder = self.upload_config.url
        # Use the file's relative path to maintain directory structure, if needed
        if file_data.source_identifiers and file_data.source_identifiers.relative_path:
            # Combine the base destination folder with the file's relative path
            destination_path = Path(base_destination_folder) / Path(
                f"{file_data.source_identifiers.relative_path}.json"
            )
        else:
            # If no relative path is provided, upload directly to the base destination folder
            destination_path = Path(base_destination_folder) / f"{path.name}.json"

        destination_folder = destination_path.parent
        file_name = destination_path.name

        # Convert destination folder to a string suitable for OneDrive API
        destination_folder_str = str(destination_folder).replace("\\", "/")

        # Resolve the destination folder in OneDrive, creating every missing
        # level so the file ends up in the intended nested folder.
        folder = self._resolve_destination_folder(drive, destination_folder_str)

        # Check the size of the file
        file_size = path.stat().st_size

        if file_size < MAX_BYTES_SIZE:
            # Use simple upload for small files
            with path.open("rb") as local_file:
                content = local_file.read()
            logger.info(f"Uploading {path} to {destination_path} using simple upload")
            try:
                uploaded_file = folder.upload(file_name, content).execute_query()
                if not uploaded_file or uploaded_file.name != file_name:
                    raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
                # Log details about the uploaded file
                logger.info(
                    f"Uploaded file '{uploaded_file.name}' with ID '{uploaded_file.id}'"
                )
            except Exception as e:
                logger.error(f"Failed to upload file '{file_name}': {e}", exc_info=True)
                raise DestinationConnectionError(
                    f"Failed to upload file '{file_name}': {e}"
                ) from e
        else:
            # Use resumable upload for large files
            destination_drive_item = drive.root.get_by_path(destination_folder_str)

            logger.info(
                f"Uploading {path} to {destination_folder_str} using resumable upload"
            )

            try:
                uploaded_file = destination_drive_item.resumable_upload(
                    source_path=str(path)
                ).execute_query()
                # Rename the uploaded file to the original source name with a .json extension
                # Overwrite the file if it already exists
                renamed_file = uploaded_file.move(
                    name=file_name, conflict_behavior=ConflictBehavior.Replace
                ).execute_query()
                # Validate the upload
                if not renamed_file or renamed_file.name != file_name:
                    raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
                # Log details about the uploaded file
                logger.info(f"Uploaded file {renamed_file.name} with ID {renamed_file.id}")
            except Exception as e:
                logger.error(
                    f"Failed to upload file '{file_name}' using resumable upload: {e}",
                    exc_info=True,
                )
                raise DestinationConnectionError(
                    f"Failed to upload file '{file_name}' using resumable upload: {e}"
                ) from e
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
# Source registry entry for OneDrive: pairs the indexer and downloader with
# their config classes and the shared connection config.
onedrive_source_entry = SourceRegistryEntry(
    indexer=OnedriveIndexer,
    indexer_config=OnedriveIndexerConfig,
    downloader=OnedriveDownloader,
    downloader_config=OnedriveDownloaderConfig,
    connection_config=OnedriveConnectionConfig,
)
|
|
478
|
+
|
|
479
|
+
# Destination registry entry for OneDrive: pairs the uploader and the
# blob-store upload stager with their config classes and the shared
# connection config.
onedrive_destination_entry = DestinationRegistryEntry(
    uploader=OnedriveUploader,
    uploader_config=OnedriveUploaderConfig,
    upload_stager=BlobStoreUploadStager,
    upload_stager_config=BlobStoreUploadStagerConfig,
    connection_config=OnedriveConnectionConfig,
)
|