unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
11
|
+
from unstructured_ingest.error import ProviderError, UserAuthError, UserError, ValueError
|
|
12
|
+
from unstructured_ingest.logger import logger
|
|
13
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
14
|
+
DestinationRegistryEntry,
|
|
15
|
+
SourceRegistryEntry,
|
|
16
|
+
)
|
|
17
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
18
|
+
FsspecAccessConfig,
|
|
19
|
+
FsspecConnectionConfig,
|
|
20
|
+
FsspecDownloader,
|
|
21
|
+
FsspecDownloaderConfig,
|
|
22
|
+
FsspecIndexer,
|
|
23
|
+
FsspecIndexerConfig,
|
|
24
|
+
FsspecUploader,
|
|
25
|
+
FsspecUploaderConfig,
|
|
26
|
+
)
|
|
27
|
+
from unstructured_ingest.processes.connectors.fsspec.utils import json_serial, sterilize_dict
|
|
28
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
29
|
+
BlobStoreUploadStager,
|
|
30
|
+
BlobStoreUploadStagerConfig,
|
|
31
|
+
)
|
|
32
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from adlfs import AzureBlobFileSystem
|
|
36
|
+
|
|
37
|
+
CONNECTOR_TYPE = "azure"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def azure_json_serial(obj):
    """Serialize Azure-specific values, deferring everything else to ``json_serial``.

    Args:
        obj: The value that needs a JSON-friendly representation.

    Returns:
        A ``dict`` for ``ContentSettings`` instances, a ``str`` for ``bytearray``
        instances, otherwise whatever ``json_serial(obj)`` returns.
    """
    # Imported lazily so this module loads without the optional azure dependency.
    # The public package export is used rather than the private ``_models`` module,
    # whose location is not part of the SDK's supported surface.
    from azure.storage.blob import ContentSettings

    if isinstance(obj, ContentSettings):
        return dict(obj)
    if isinstance(obj, bytearray):
        # NOTE(review): str() on a bytearray yields its repr form
        # ("bytearray(b'...')"), not decoded text - confirm this is what
        # consumers of the serialized metadata expect.
        return str(obj)
    return json_serial(obj)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Indexer configuration for the Azure connector; declares nothing beyond the
# fsspec base configuration it inherits from.
class AzureIndexerConfig(FsspecIndexerConfig):
    ...
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Credentials for the Azure connector. Every field is optional on its own, but
# ``model_post_init`` rejects a config that sets neither ``connection_string``
# nor ``account_name``.
class AzureAccessConfig(FsspecAccessConfig):
    account_name: Optional[str] = Field(
        default=None,
        description=(
            "The storage account name. This is used to authenticate "
            "requests signed with an account key and to construct "
            "the storage endpoint. It is required unless a connection "
            "string is given, or if a custom domain is used with "
            "anonymous authentication."
        ),
    )
    account_key: Optional[str] = Field(
        default=None,
        description=(
            "The storage account key. This is used for shared key "
            "authentication. If any of account key, sas token or "
            "client_id are not specified, anonymous access will be used."
        ),
    )
    connection_string: Optional[str] = Field(
        default=None,
        description=(
            "If specified, this will override all other parameters. See "
            "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ "  # noqa: E501
            "for the connection string format."
        ),
    )
    sas_token: Optional[str] = Field(
        default=None,
        description=(
            "A shared access signature token to use to authenticate "
            "requests instead of the account key. If account key and "
            "sas token are both specified, account key will be used "
            "to sign. If any of account key, sas token or client_id "
            "are not specified, anonymous access will be used."
        ),
    )

    def model_post_init(self, __context: Any) -> None:
        """Reject a config that names neither a connection string nor an account.

        Raises:
            ValueError: (the ``ValueError`` name in scope for this module) when both
                ``connection_string`` and ``account_name`` are ``None``.
        """
        if self.connection_string is not None:
            return
        if self.account_name is not None:
            return
        raise ValueError("either connection_string or account_name must be set")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Connection settings for the Azure connector: fixes the supported protocol to
# "az", holds the secret access config, and translates Azure SDK exceptions into
# the project's error types.
class AzureConnectionConfig(FsspecConnectionConfig):
    supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
    access_config: Secret[AzureAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the secret access settings as a plain dict without empty values.

        Returns:
            The dumped ``AzureAccessConfig`` restricted to entries whose value is truthy.
        """
        # The filter is a truthiness test, so empty strings are dropped along with
        # None; only populated settings are returned.
        dumped = self.access_config.get_secret_value().model_dump()
        return {k: v for k, v in dumped.items() if v}

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
        """Yield the client produced by the parent ``get_client`` for ``protocol``.

        The override exists to attach the dependency check for the ``azure`` extra.
        """
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Map an Azure SDK exception onto the project's error hierarchy.

        Args:
            e: The exception raised while talking to Azure.

        Returns:
            ``UserAuthError`` for ``ClientAuthenticationError``; ``UserError`` for a
            4xx status; ``ProviderError`` for a status of 500 or above; otherwise the
            original exception (after logging it as unhandled).
        """
        from azure.core.exceptions import ClientAuthenticationError, HttpResponseError

        if not isinstance(e, HttpResponseError):
            logger.error(f"unhandled exception from azure ({type(e)}): {e}", exc_info=True)
            return e

        # ``reason`` is None when the SDK raised without an HTTP response attached;
        # fall back to the exception text so the wrapped error still has a message.
        message = e.reason or str(e)
        if isinstance(e, ClientAuthenticationError):
            return UserAuthError(message)

        status_code = e.status_code
        if status_code is not None:
            if 400 <= status_code < 500:
                return UserError(message)
            if status_code >= 500:
                return ProviderError(message)
        logger.error(f"unhandled exception from azure ({type(e)}): {e}", exc_info=True)
        return e
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass
class AzureIndexer(FsspecIndexer):
    """Indexer for the Azure connector built on the fsspec indexer."""

    connection_config: AzureConnectionConfig
    index_config: AzureIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def sterilize_info(self, file_data: dict) -> dict:
        """Pass ``file_data`` through ``sterilize_dict`` using the Azure serializer."""
        return sterilize_dict(data=file_data, default=azure_json_serial)

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata from one file-info record.

        Args:
            file_info: Record describing a single remote file. ``name`` is required;
                ``creation_time``, ``last_modified``, ``size`` and ``etag`` are read
                when present.

        Returns:
            A ``FileDataSourceMetadata`` whose dates are stringified epoch timestamps
            (or ``None`` when unavailable) and whose ``date_processed`` is the current
            time.
        """
        path = file_info["name"]

        # A key may be present with a None value; treat that the same as a missing
        # key instead of failing on ``None.timestamp()``.
        # Assumes non-None values expose ``.timestamp()`` (datetime-like) - TODO confirm.
        created = file_info.get("creation_time")
        date_created = str(created.timestamp()) if created is not None else None
        modified = file_info.get("last_modified")
        date_modified = str(modified.timestamp()) if modified is not None else None

        file_size = file_info.get("size")
        version = file_info.get("etag")
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# Downloader configuration for the Azure connector; declares nothing beyond the
# fsspec base configuration it inherits from.
class AzureDownloaderConfig(FsspecDownloaderConfig):
    ...
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@dataclass
class AzureDownloader(FsspecDownloader):
    """Downloader for the Azure connector; pins the protocol to ``az``."""

    # Field order is kept exactly as declared, since dataclass field ordering matters.
    protocol: str = "az"
    connection_config: AzureConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh default config is created per instance via the factory.
    download_config: Optional[AzureDownloaderConfig] = field(
        default_factory=AzureDownloaderConfig,
    )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# Uploader configuration for the Azure connector; declares nothing beyond the
# fsspec base configuration it inherits from.
class AzureUploaderConfig(FsspecUploaderConfig):
    ...
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@dataclass
class AzureUploader(FsspecUploader):
    """Uploader for the Azure connector built on the fsspec uploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: AzureConnectionConfig
    # NOTE(review): the annotation is non-optional but the default is None -
    # confirm whether callers are always expected to supply a config.
    upload_config: AzureUploaderConfig = field(
        default=None,
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# Registry entry bundling the Azure classes used when Azure acts as a source.
azure_source_entry = SourceRegistryEntry(
    connection_config=AzureConnectionConfig,
    indexer_config=AzureIndexerConfig,
    indexer=AzureIndexer,
    downloader_config=AzureDownloaderConfig,
    downloader=AzureDownloader,
)
|
|
196
|
+
|
|
197
|
+
# Registry entry bundling the Azure classes used when Azure acts as a destination;
# staging is handled by the shared blob-store upload stager.
azure_destination_entry = DestinationRegistryEntry(
    connection_config=AzureConnectionConfig,
    uploader_config=AzureUploaderConfig,
    uploader=AzureUploader,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from time import time
|
|
6
|
+
from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from dateutil import parser
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
from pydantic.functional_validators import BeforeValidator
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
13
|
+
from unstructured_ingest.error import ProviderError, UserAuthError, UserError
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
16
|
+
DestinationRegistryEntry,
|
|
17
|
+
SourceRegistryEntry,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
20
|
+
FsspecAccessConfig,
|
|
21
|
+
FsspecConnectionConfig,
|
|
22
|
+
FsspecDownloader,
|
|
23
|
+
FsspecDownloaderConfig,
|
|
24
|
+
FsspecIndexer,
|
|
25
|
+
FsspecIndexerConfig,
|
|
26
|
+
FsspecUploader,
|
|
27
|
+
FsspecUploaderConfig,
|
|
28
|
+
)
|
|
29
|
+
from unstructured_ingest.processes.connectors.utils import conform_string_to_dict
|
|
30
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
31
|
+
BlobStoreUploadStager,
|
|
32
|
+
BlobStoreUploadStagerConfig,
|
|
33
|
+
)
|
|
34
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from boxfs import BoxFileSystem
|
|
38
|
+
|
|
39
|
+
CONNECTOR_TYPE = "box"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Indexer configuration for the Box connector; declares nothing beyond the
# fsspec base configuration it inherits from.
class BoxIndexerConfig(FsspecIndexerConfig):
    ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Credentials for the Box connector. The single field is declared as a dict, and
# incoming values are first passed through ``conform_string_to_dict`` by the
# before-validator.
class BoxAccessConfig(FsspecAccessConfig):
    box_app_config: Annotated[
        dict,
        BeforeValidator(conform_string_to_dict),
    ] = Field(description="Box app credentials as a JSON string.")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Connection settings for the Box connector: fixes the supported protocol to
# "box", authenticates with the Box SDK, and translates Box SDK exceptions into
# the project's error types.
class BoxConnectionConfig(FsspecConnectionConfig):
    supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
    access_config: Secret[BoxAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Authenticate against Box and return the resulting access settings.

        Returns:
            A dict with the authenticated ``oauth`` object plus every dumped access
            setting except ``box_app_config``.
        """
        from boxsdk import JWTAuth

        ac = self.access_config.get_secret_value()

        # Create and authenticate the JWTAuth object
        oauth = JWTAuth.from_settings_dictionary(ac.box_app_config)
        oauth.authenticate_instance()

        # The raw app config is dropped from the returned settings; the
        # authenticated oauth object is returned in its place.
        access_kwargs_with_oauth: dict[str, Any] = {"oauth": oauth}
        access_config: dict[str, Any] = ac.model_dump()
        access_config.pop("box_app_config", None)
        access_kwargs_with_oauth.update(access_config)

        return access_kwargs_with_oauth

    def wrap_error(self, e: Exception) -> Exception:
        """Map a Box SDK exception onto the project's error hierarchy.

        Args:
            e: The exception raised while talking to Box.

        Returns:
            ``UserAuthError`` for ``BoxOAuthException``; ``UserError`` for a 4xx
            status; ``ProviderError`` for a status of 500 or above; otherwise the
            original exception (after logging it as unhandled).
        """
        from boxsdk.exception import BoxAPIException, BoxOAuthException

        if isinstance(e, BoxOAuthException):
            # Always hand a string to the wrapped error, even when the SDK left
            # ``message`` empty.
            return UserAuthError(e.message or str(e))
        if not isinstance(e, BoxAPIException):
            logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True)
            return e

        # Fall back to the exception text (not the exception object itself) when
        # ``message`` is empty.
        message = e.message or str(e)
        if error_code_status := e.status:
            if 400 <= error_code_status < 500:
                return UserError(message)
            if error_code_status >= 500:
                return ProviderError(message)

        logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True)
        return e

    @requires_dependencies(["boxfs"], extras="box")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
        """Yield the client produced by the parent ``get_client`` for ``protocol``.

        The override exists to attach the dependency check for the ``box`` extra.
        """
        with super().get_client(protocol=protocol) as client:
            yield client
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass
class BoxIndexer(FsspecIndexer):
    """Indexer for the Box connector built on the fsspec indexer."""

    connection_config: BoxConnectionConfig
    index_config: BoxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata from one file-info record.

        Args:
            file_info: Record describing a single remote file. ``name`` is required;
                ``modified_at``, ``created_at``, ``size`` and ``id`` are read when
                present.

        Returns:
            A ``FileDataSourceMetadata`` whose dates are stringified epoch timestamps
            (or ``None`` when the field is missing or empty) and whose
            ``date_processed`` is the current time.
        """

        def _epoch_str(raw):
            # Empty or missing values stay None; anything else is parsed as a date string.
            return str(parser.parse(raw).timestamp()) if raw else None

        remote_name = file_info["name"]
        # The modified timestamp is parsed before the created one.
        modified_epoch = _epoch_str(file_info.get("modified_at"))
        created_epoch = _epoch_str(file_info.get("created_at"))

        box_file_id = file_info.get("id")
        locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
            "file_id": file_info.get("id"),
        }
        return FileDataSourceMetadata(
            date_created=created_epoch,
            date_modified=modified_epoch,
            date_processed=str(time()),
            version=box_file_id,
            url=f"{self.index_config.protocol}://{remote_name}",
            record_locator=locator,
            filesize_bytes=file_info.get("size"),
        )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# Downloader configuration for the Box connector; declares nothing beyond the
# fsspec base configuration it inherits from.
class BoxDownloaderConfig(FsspecDownloaderConfig):
    ...
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@dataclass
class BoxDownloader(FsspecDownloader):
    """Downloader for the Box connector; pins the protocol to ``box``."""

    # Field order is kept exactly as declared, since dataclass field ordering matters.
    protocol: str = "box"
    connection_config: BoxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh default config is created per instance via the factory.
    download_config: Optional[BoxDownloaderConfig] = field(
        default_factory=BoxDownloaderConfig,
    )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# Uploader configuration for the Box connector; declares nothing beyond the
# fsspec base configuration it inherits from.
class BoxUploaderConfig(FsspecUploaderConfig):
    ...
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclass
class BoxUploader(FsspecUploader):
    """Uploader for the Box connector built on the fsspec uploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: BoxConnectionConfig
    # NOTE(review): the annotation is non-optional but the default is None -
    # confirm whether callers are always expected to supply a config.
    upload_config: BoxUploaderConfig = field(
        default=None,
    )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# Registry entry bundling the Box classes used when Box acts as a source.
box_source_entry = SourceRegistryEntry(
    connection_config=BoxConnectionConfig,
    indexer_config=BoxIndexerConfig,
    indexer=BoxIndexer,
    downloader_config=BoxDownloaderConfig,
    downloader=BoxDownloader,
)
|
|
169
|
+
|
|
170
|
+
# Registry entry bundling the Box classes used when Box acts as a destination;
# staging is handled by the shared blob-store upload stager.
box_destination_entry = DestinationRegistryEntry(
    connection_config=BoxConnectionConfig,
    uploader_config=BoxUploaderConfig,
    uploader=BoxUploader,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from time import time
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
10
|
+
from unstructured_ingest.error import (
|
|
11
|
+
ProviderError,
|
|
12
|
+
UserAuthError,
|
|
13
|
+
UserError,
|
|
14
|
+
ValueError,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.error import (
|
|
17
|
+
RateLimitError as CustomRateLimitError,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
21
|
+
DestinationRegistryEntry,
|
|
22
|
+
SourceRegistryEntry,
|
|
23
|
+
)
|
|
24
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
25
|
+
FsspecAccessConfig,
|
|
26
|
+
FsspecConnectionConfig,
|
|
27
|
+
FsspecDownloader,
|
|
28
|
+
FsspecDownloaderConfig,
|
|
29
|
+
FsspecIndexer,
|
|
30
|
+
FsspecIndexerConfig,
|
|
31
|
+
FsspecUploader,
|
|
32
|
+
FsspecUploaderConfig,
|
|
33
|
+
)
|
|
34
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
35
|
+
BlobStoreUploadStager,
|
|
36
|
+
BlobStoreUploadStagerConfig,
|
|
37
|
+
)
|
|
38
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
39
|
+
|
|
40
|
+
# Placeholder guard: this module currently has no typing-only imports.
if TYPE_CHECKING:
    ...

# Connector identifier used as the `connector_type` default by the Dropbox
# connection config, indexer, downloader and uploader in this module.
CONNECTOR_TYPE = "dropbox"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DropboxIndexerConfig(FsspecIndexerConfig):
    # Indexer settings for Dropbox; identical to FsspecIndexerConfig except that
    # `path_without_protocol` is normalised to always start with "/".

    def model_post_init(self, __context):
        """Ensure ``path_without_protocol`` begins with a leading slash."""
        current_path = self.path_without_protocol
        if current_path.startswith("/"):
            # Already rooted; nothing to normalise.
            return
        self.path_without_protocol = "/" + current_path
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class DropboxAccessConfig(FsspecAccessConfig):
    # Credential fields for Dropbox. All of them default to None at this level.

    # Short-lived (4h) token that needs to be generated anew each time.
    token: Optional[str] = Field(default=None, description="Dropbox access token.")
    app_key: Optional[str] = Field(default=None, description="Dropbox app key.")
    app_secret: Optional[str] = Field(default=None, description="Dropbox app secret.")
    # Long-lived token that doesn't expire.
    refresh_token: Optional[str] = Field(default=None, description="Dropbox refresh token.")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class DropboxConnectionConfig(FsspecConnectionConfig):
    # Connection settings for Dropbox. Holds the secret access config, exchanges
    # refresh credentials for an access token, and maps Dropbox SDK errors onto
    # the project's error types.
    access_config: Secret[DropboxAccessConfig] = Field(
        default=DropboxAccessConfig(), validate_default=True
    )
    connector_type: str = Field(default=CONNECTOR_TYPE)

    @requires_dependencies(["dropbox"])
    def get_dropbox_access_token_from_refresh(
        self,
        refresh_token: str,
        app_key: str,
        app_secret: str,
    ) -> str:
        """
        Uses the Dropbox Python SDK to exchange a long-lived refresh token for an access token.

        Args:
            refresh_token: Long-lived Dropbox refresh token.
            app_key: Dropbox app key paired with the refresh token.
            app_secret: Dropbox app secret paired with the refresh token.

        Returns:
            The access token held by the SDK client after the refresh call.
        """
        import dropbox

        dbx = dropbox.Dropbox(
            oauth2_access_token=None,
            oauth2_refresh_token=refresh_token,
            app_key=app_key,
            app_secret=app_secret,
        )

        # This call fetches a new short-lived token and auto-updates dbx._oauth2_access_token
        dbx.check_and_refresh_access_token()
        short_lived_token = dbx._oauth2_access_token  # Private attr, but standard usage
        return short_lived_token

    def get_access_config(self) -> dict[str, Any]:
        """
        Overrides the parent FsspecConnectionConfig.get_access_config() to ensure
        that we always provide an access token if refresh credentials exist.

        Returns:
            The parent's access config dict, with ``token`` replaced by a freshly
            generated access token when refresh credentials are available.

        Raises:
            ValueError: (the project's error type imported at module top) when no
                token could be generated from the refresh credentials, or when
                neither a token nor complete refresh credentials were provided.
        """
        base_conf = super().get_access_config()

        refresh_token = base_conf.get("refresh_token")
        app_key = base_conf.get("app_key")
        app_secret = base_conf.get("app_secret")

        # Standard scenario - we have a refresh token and app credentials,
        # which we use to retrieve an access token.
        if refresh_token and app_key and app_secret:
            logger.debug("Attempting to generate access token from refresh token...")
            new_token = self.get_dropbox_access_token_from_refresh(
                refresh_token=refresh_token,
                app_key=app_key,
                app_secret=app_secret,
            )
            if not new_token:
                raise ValueError(
                    "Unable to retrieve an access token from Dropbox. "
                    "Please check that your refresh token, app key, and secret are valid."
                )
            base_conf["token"] = new_token
        elif not base_conf.get("token"):  # we might already have an access token from outside
            # We have neither an existing short-lived token nor refresh credentials
            raise ValueError(
                "No valid token or refresh_token with app credentials was found. "
                "Please check that your refresh token, app key, and secret are valid "
                "or provide a valid short-lived token"
            )

        return base_conf

    @requires_dependencies(["dropbox"])
    def wrap_error(self, e: Exception) -> Exception:
        """
        Translate a Dropbox SDK exception into one of the project's error types.

        The wrapped error is always *returned*, never raised, so the caller
        decides when to raise it.

        Args:
            e: The exception caught while talking to Dropbox.

        Returns:
            ``UserAuthError`` for ``AuthError``, the project's rate-limit error
            for ``RateLimitError``, ``UserError`` for other 4xx responses,
            ``ProviderError`` for 5xx responses, and ``e`` itself (after logging)
            for anything that cannot be classified.
        """
        from dropbox.exceptions import AuthError, HttpError, RateLimitError

        if not isinstance(e, HttpError):
            logger.error(f"Unhandled Dropbox exception: {repr(e)}", exc_info=True)
            return e

        # Return (rather than raise) so every branch honours the `-> Exception` contract.
        if isinstance(e, AuthError):
            return UserAuthError(e.error)
        if isinstance(e, RateLimitError):
            return CustomRateLimitError(e.error)

        status_code = e.status_code
        # Use the response body as the message; fall back to the exception's repr
        # when the body is missing or empty so the wrapped error is never blank.
        detail = getattr(e, "body", None) or repr(e)
        if 400 <= status_code < 500:
            return UserError(detail)
        if status_code >= 500:
            return ProviderError(detail)

        logger.error(f"Unhandled Dropbox HttpError: {repr(e)}", exc_info=True)
        return e
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@dataclass
class DropboxIndexer(FsspecIndexer):
    """Indexer for the Dropbox connector.

    Overrides how the path and source metadata are derived from the
    ``file_info`` dict of each listed entry; everything else is inherited from
    ``FsspecIndexer``.
    """

    connection_config: DropboxConnectionConfig
    index_config: DropboxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_path(self, file_info: dict) -> str:
        """Return the entry's path, read from the ``name`` key of ``file_info``."""
        return file_info["name"]

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build the source metadata for one listed Dropbox entry.

        Args:
            file_info: Listing entry. ``name`` is required; ``server_modified``,
                ``client_modified``, ``size``, ``content_hash`` and ``id`` are
                read when present. The two timestamps are compared and have
                ``.timestamp()`` called on them, so they are assumed to be
                datetime-like objects.

        Returns:
            A ``FileDataSourceMetadata`` whose creation/modification dates are the
            earlier/later of the two timestamps (both ``None`` unless both
            timestamps are present), with ``content_hash`` as the version.
        """
        path = file_info["name"].lstrip("/")
        server_modified = file_info.get("server_modified")
        client_modified = file_info.get("client_modified")

        date_created = None
        date_modified = None
        if server_modified and client_modified:
            # Earlier timestamp is treated as creation, later as modification.
            # min/max also covers equal timestamps, which previously left both
            # dates unset.
            date_created = str(min(server_modified, client_modified).timestamp())
            date_modified = str(max(server_modified, client_modified).timestamp())

        file_size = file_info.get("size")

        version = file_info.get("content_hash")
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
            "file_id": file_info.get("id"),
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class DropboxDownloaderConfig(FsspecDownloaderConfig):
    # Adds no fields of its own: every download option comes from
    # FsspecDownloaderConfig. Registered as the downloader config in
    # dropbox_source_entry.
    ...
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@dataclass
class DropboxDownloader(FsspecDownloader):
    """Downloader for the Dropbox connector.

    Defines no methods of its own; it only pins the field defaults below and
    inherits everything else from ``FsspecDownloader``.
    """

    # fsspec protocol name used for Dropbox.
    protocol: str = "dropbox"
    connection_config: DropboxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh DropboxDownloaderConfig is created per instance when none is supplied.
    download_config: Optional[DropboxDownloaderConfig] = field(default_factory=DropboxDownloaderConfig)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class DropboxUploaderConfig(FsspecUploaderConfig):
    # Adds no fields of its own: every upload option comes from
    # FsspecUploaderConfig. Registered as the uploader config in
    # dropbox_destination_entry.
    ...
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
class DropboxUploader(FsspecUploader):
    """Uploader for the Dropbox connector.

    Defines no methods of its own; it only pins the field defaults below and
    inherits everything else from ``FsspecUploader``.
    """

    connector_type: str = CONNECTOR_TYPE
    connection_config: DropboxConnectionConfig
    # The default is None, so the annotation is Optional to match the actual
    # contract (and the Optional[...] style already used by DropboxDownloader).
    upload_config: Optional[DropboxUploaderConfig] = field(default=None)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# Source-side registration for Dropbox: bundles the connection, indexer and
# downloader classes together with their config types.
dropbox_source_entry = SourceRegistryEntry(
    connection_config=DropboxConnectionConfig,
    indexer=DropboxIndexer,
    indexer_config=DropboxIndexerConfig,
    downloader=DropboxDownloader,
    downloader_config=DropboxDownloaderConfig,
)
|
|
231
|
+
|
|
232
|
+
# Destination-side registration for Dropbox: bundles the connection and uploader
# classes with the generic blob-store upload stager.
dropbox_destination_entry = DestinationRegistryEntry(
    connection_config=DropboxConnectionConfig,
    uploader=DropboxUploader,
    uploader_config=DropboxUploaderConfig,
    upload_stager=BlobStoreUploadStager,
    upload_stager_config=BlobStoreUploadStagerConfig,
)
|