unstructured-ingest 1.2.32 (unstructured_ingest-1.2.32-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import random
|
|
5
|
+
import shutil
|
|
6
|
+
import tempfile
|
|
7
|
+
from contextlib import contextmanager
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
|
|
11
|
+
from uuid import NAMESPACE_DNS, uuid5
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, Field, Secret
|
|
14
|
+
|
|
15
|
+
from unstructured_ingest.data_types.file_data import (
|
|
16
|
+
FileData,
|
|
17
|
+
FileDataSourceMetadata,
|
|
18
|
+
SourceIdentifiers,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.error import TypeError, ValueError
|
|
21
|
+
from unstructured_ingest.interfaces import (
|
|
22
|
+
AccessConfig,
|
|
23
|
+
ConnectionConfig,
|
|
24
|
+
Downloader,
|
|
25
|
+
DownloaderConfig,
|
|
26
|
+
DownloadResponse,
|
|
27
|
+
Indexer,
|
|
28
|
+
IndexerConfig,
|
|
29
|
+
Uploader,
|
|
30
|
+
UploaderConfig,
|
|
31
|
+
)
|
|
32
|
+
from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
|
|
33
|
+
from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from fsspec import AbstractFileSystem
|
|
37
|
+
|
|
38
|
+
# Registry key shared by all fsspec-based connector components in this module.
CONNECTOR_TYPE = "fsspec"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class FileConfig(BaseModel):
    """Base config for fsspec-backed connectors keyed off a `protocol://dir/path` URL.

    On construction the ``remote_url`` is split into ``protocol`` and
    ``path_without_protocol``, which downstream components use to instantiate
    the matching fsspec filesystem and to address objects within it.
    """

    remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
    protocol: str = Field(init=False)
    path_without_protocol: str = Field(init=False)
    supported_protocols: list[str] = Field(
        init=False,
        default_factory=lambda: [
            "s3",
            "s3a",
            "abfs",
            "az",
            "gs",
            "gcs",
            "box",
            "dropbox",
            "sftp",
        ],
    )

    def __init__(self, **data):
        # Derive protocol/path before pydantic validation runs. Using
        # str.partition (split at the FIRST "://") instead of a bare
        # split-and-unpack fixes two failure modes of the original code:
        # a missing remote_url raised an opaque KeyError, and a URL whose
        # path itself contained "://" raised an unpack ValueError.
        remote_url = data.get("remote_url") or ""
        protocol, separator, path_without_protocol = remote_url.partition("://")
        if not separator or not protocol:
            raise ValueError(
                f"remote_url must be formatted as `protocol://dir/path`, got: {remote_url!r}"
            )
        data["protocol"] = protocol
        data["path_without_protocol"] = path_without_protocol
        super().__init__(**data)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class FsspecIndexerConfig(FileConfig, IndexerConfig):
    """Indexer options for fsspec-backed sources."""

    # When True, walk the remote path recursively (fs.find); otherwise list
    # only the top level (fs.ls).
    recursive: bool = False
    # When set, index a random sample of at most this many discovered files.
    sample_n_files: Optional[int] = None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class FsspecAccessConfig(AccessConfig):
    # Intentionally empty: declares no generic credentials. Presumably
    # protocol-specific connectors subclass this to add their own secret
    # fields — confirm against the concrete fsspec connectors.
    pass
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class FsspecConnectionConfig(ConnectionConfig):
    """Connection settings for fsspec-backed connectors.

    Builds concrete fsspec filesystem instances on demand from the stored
    access configuration.
    """

    access_config: Secret[FsspecAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    def get_client(self, protocol: str) -> Generator["AbstractFileSystem", None, None]:
        """Yield a filesystem for *protocol*, constructed from the access config."""
        # Imported lazily so the module loads even when fsspec is absent.
        from fsspec import get_filesystem_class

        filesystem_cls = get_filesystem_class(protocol)
        yield filesystem_cls(**self.get_access_config())

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a provider error into a connector error; identity by default."""
        return e
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Type variables letting protocol-specific subclasses (s3, gcs, azure, ...)
# carry their own config types through the shared fsspec base classes.
FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class FsspecIndexer(Indexer):
    """Lists files on an fsspec-compatible remote filesystem and emits one
    FileData record per discovered file.

    Subclasses supply protocol-specific metadata extraction via
    ``get_metadata`` (abstract here).
    """

    connection_config: FsspecConnectionConfigT
    index_config: FsspecIndexerConfigT
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate provider-specific error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def precheck(self) -> None:
        """Validate connectivity: list the configured path and HEAD one file.

        Raises:
            Exception: the wrapped original error when listing or the HEAD
                request fails.
        """
        from fsspec import get_filesystem_class

        self.log_operation_start(
            "Connection validation",
            protocol=self.index_config.protocol,
            path=self.index_config.path_without_protocol,
        )

        try:
            fs = get_filesystem_class(self.index_config.protocol)(
                **self.connection_config.get_access_config(),
            )
            files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
            valid_files = [x.get("name") for x in files if x.get("type") == "file"]
            if not valid_files:
                # Nothing to probe; an empty listing still counts as a
                # successful connection.
                self.log_operation_complete("Connection validation", count=0)
                return
            file_to_sample = valid_files[0]
            self.log_debug(f"attempting to make HEAD request for file: {file_to_sample}")
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                client.head(path=file_to_sample)

            self.log_connection_validated(
                connector_type=self.connector_type,
                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
            )

        except Exception as e:
            self.log_connection_failed(
                connector_type=self.connector_type,
                error=e,
                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
            )
            raise self.wrap_error(e=e)

    def get_file_info(self) -> list[dict[str, Any]]:
        """Return detail dicts for all real, non-empty files under the
        configured path, honoring the recursive and sampling settings."""
        if not self.index_config.recursive:
            # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
            # because they are seen as 0 byte files
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                files = client.ls(self.index_config.path_without_protocol, detail=True)

        else:
            # fs.find will recursively walk directories
            # "size" is a common key for all the cloud protocols with fs
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                found = client.find(
                    self.index_config.path_without_protocol,
                    detail=True,
                )
            files = found.values()
        # NOTE(review): file.get("size") > 0 raises TypeError if a backend
        # omits "size" (None > 0) — confirm all supported protocols always
        # populate it.
        filtered_files = [
            file for file in files if file.get("size") > 0 and file.get("type") == "file"
        ]

        if self.index_config.sample_n_files:
            filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)

        return filtered_files

    def sample_n_files(self, files: list[dict[str, Any]], n: int) -> list[dict[str, Any]]:
        """Return a random sample of *n* files, or all files (with a warning)
        when *n* is not smaller than the population."""
        if len(files) <= n:
            self.log_warning(
                f"number of files to be sampled={n} is not smaller than the number"
                f" of files found ({len(files)}). Returning all of the files as the"
                " sample."
            )
            return files

        return random.sample(files, n)

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata from an fsspec detail dict; protocol-specific,
        must be implemented by subclasses."""
        raise NotImplementedError()

    def get_path(self, file_info: dict) -> str:
        """Extract the full remote path from an fsspec detail dict."""
        return file_info["name"]

    def sterilize_info(self, file_data: dict) -> dict:
        """Make the raw fsspec detail dict JSON-serializable."""
        return sterilize_dict(data=file_data)

    def create_init_file_data(self, remote_filepath: Optional[str] = None) -> FileData:
        # Create initial file data that requires no network calls and is constructed purely
        # with information that exists in the config
        remote_filepath = remote_filepath or self.index_config.remote_url
        path_without_protocol = remote_filepath.split("://")[1]
        # NOTE(review): str.replace removes EVERY occurrence of the path
        # substring, not just the prefix — verify this cannot corrupt rel_path
        # for URLs where the path text repeats.
        rel_path = remote_filepath.replace(path_without_protocol, "").lstrip("/")
        return FileData(
            identifier=str(uuid5(NAMESPACE_DNS, remote_filepath)),
            connector_type=self.connector_type,
            display_name=remote_filepath,
            source_identifiers=SourceIdentifiers(
                filename=Path(remote_filepath).name,
                rel_path=rel_path or None,
                fullpath=remote_filepath,
            ),
            metadata=FileDataSourceMetadata(url=remote_filepath),
        )

    def hydrate_file_data(self, init_file_data: FileData) -> None:
        """Fill in metadata on an init-only FileData by listing the configured
        path, which must resolve to exactly one valid file.

        Raises:
            ValueError: when the path matches zero or more than one file.
        """
        # Get file info
        with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
            files = client.ls(self.index_config.path_without_protocol, detail=True)
        filtered_files = [
            file for file in files if file.get("size") > 0 and file.get("type") == "file"
        ]
        if not filtered_files:
            raise ValueError(f"{init_file_data} did not reference any valid file")
        if len(filtered_files) > 1:
            raise ValueError(f"{init_file_data} referenced more than one file")
        file_info = filtered_files[0]
        # NOTE(review): run() places get_metadata() results in the `metadata`
        # field and the sterilized detail dict in `additional_metadata`; here
        # the metadata object is assigned to `additional_metadata` instead —
        # confirm this asymmetry is intentional.
        init_file_data.additional_metadata = self.get_metadata(file_info=file_info)

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a FileData record for every discovered file."""
        self.log_indexing_start(f"{self.connector_type} files")

        files = self.get_file_info()
        total_files = len(files)

        self.log_operation_start("File indexing", total_files=total_files)

        for i, file_info in enumerate(files):
            file_path = self.get_path(file_info=file_info)

            # Only log progress for larger operations
            if total_files > 5:
                self.log_progress(
                    current=i + 1, total=total_files, item_type="files", operation="Indexing"
                )

            # Note: we remove any remaining leading slashes (Box introduces these)
            # to get a valid relative path
            rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")

            additional_metadata = self.sterilize_info(file_data=file_info)
            # Downloaders read this key to locate the remote object.
            additional_metadata["original_file_path"] = file_path
            yield FileData(
                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                connector_type=self.connector_type,
                source_identifiers=SourceIdentifiers(
                    filename=Path(file_path).name,
                    rel_path=rel_path or None,
                    fullpath=file_path,
                ),
                metadata=self.get_metadata(file_info=file_info),
                additional_metadata=additional_metadata,
                display_name=file_path,
            )

        self.log_indexing_complete(f"{self.connector_type} files", total_files)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class FsspecDownloaderConfig(DownloaderConfig):
    # Intentionally empty: the generic fsspec downloader needs no options
    # beyond the shared DownloaderConfig; subclasses may add protocol fields.
    pass
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# Allows protocol-specific downloaders to carry their own config subtype.
FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloaderConfig)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
@dataclass
|
|
267
|
+
class FsspecDownloader(Downloader):
|
|
268
|
+
TEMP_DIR_PREFIX = "unstructured_"
|
|
269
|
+
|
|
270
|
+
protocol: str
|
|
271
|
+
connection_config: FsspecConnectionConfigT
|
|
272
|
+
connector_type: str = CONNECTOR_TYPE
|
|
273
|
+
download_config: Optional[FsspecDownloaderConfigT] = field(
|
|
274
|
+
default_factory=lambda: FsspecDownloaderConfig()
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
def get_download_path(self, file_data: FileData) -> Optional[Path]:
|
|
278
|
+
has_source_identifiers = file_data.source_identifiers is not None
|
|
279
|
+
has_filename = has_source_identifiers and file_data.source_identifiers.filename
|
|
280
|
+
|
|
281
|
+
if not (has_source_identifiers and has_filename):
|
|
282
|
+
return None
|
|
283
|
+
|
|
284
|
+
filename = file_data.source_identifiers.filename
|
|
285
|
+
|
|
286
|
+
mkdir_concurrent_safe(self.download_dir)
|
|
287
|
+
|
|
288
|
+
temp_dir = tempfile.mkdtemp(prefix=self.TEMP_DIR_PREFIX, dir=self.download_dir)
|
|
289
|
+
return Path(temp_dir) / filename
|
|
290
|
+
|
|
291
|
+
def is_async(self) -> bool:
|
|
292
|
+
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
293
|
+
return client.async_impl
|
|
294
|
+
|
|
295
|
+
def handle_directory_download(self, lpath: Path) -> None:
|
|
296
|
+
# If the object's name contains certain characters (i.e. '?'), it
|
|
297
|
+
# gets downloaded into a new directory of the same name. This
|
|
298
|
+
# reconciles that with what is expected, which is to download it
|
|
299
|
+
# as a file that is not within a directory.
|
|
300
|
+
if not lpath.is_dir():
|
|
301
|
+
return
|
|
302
|
+
desired_name = lpath.name
|
|
303
|
+
files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
|
|
304
|
+
if not files_in_dir:
|
|
305
|
+
raise ValueError(f"no files in {lpath}")
|
|
306
|
+
if len(files_in_dir) > 1:
|
|
307
|
+
raise ValueError(
|
|
308
|
+
"Multiple files in {}: {}".format(lpath, ", ".join([str(f) for f in files_in_dir]))
|
|
309
|
+
)
|
|
310
|
+
file = files_in_dir[0]
|
|
311
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
312
|
+
temp_location = os.path.join(temp_dir, desired_name)
|
|
313
|
+
shutil.copyfile(src=file, dst=temp_location)
|
|
314
|
+
shutil.rmtree(lpath)
|
|
315
|
+
shutil.move(src=temp_location, dst=lpath)
|
|
316
|
+
|
|
317
|
+
def wrap_error(self, e: Exception) -> Exception:
|
|
318
|
+
return self.connection_config.wrap_error(e=e)
|
|
319
|
+
|
|
320
|
+
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
321
|
+
download_path = self.get_download_path(file_data=file_data)
|
|
322
|
+
mkdir_concurrent_safe(download_path.parent)
|
|
323
|
+
|
|
324
|
+
rpath = file_data.additional_metadata["original_file_path"]
|
|
325
|
+
file_size = file_data.metadata.filesize_bytes
|
|
326
|
+
self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
|
|
327
|
+
|
|
328
|
+
try:
|
|
329
|
+
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
330
|
+
client.get_file(rpath=rpath, lpath=download_path.as_posix())
|
|
331
|
+
self.handle_directory_download(lpath=download_path)
|
|
332
|
+
|
|
333
|
+
except Exception as e:
|
|
334
|
+
self.log_error(
|
|
335
|
+
"File download failed",
|
|
336
|
+
error=e,
|
|
337
|
+
context={"file_path": rpath, "file_id": file_data.identifier},
|
|
338
|
+
)
|
|
339
|
+
raise self.wrap_error(e=e)
|
|
340
|
+
|
|
341
|
+
self.log_download_complete(
|
|
342
|
+
file_path=rpath,
|
|
343
|
+
file_id=file_data.identifier,
|
|
344
|
+
download_path=str(download_path),
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
348
|
+
|
|
349
|
+
async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
    """Async variant of run(): awaits the filesystem's get_file coroutine."""
    local_path = self.get_download_path(file_data=file_data)
    mkdir_concurrent_safe(local_path.parent)
    remote_path = file_data.additional_metadata["original_file_path"]
    self.log_download_start(
        file_path=remote_path,
        file_id=file_data.identifier,
        file_size=file_data.metadata.filesize_bytes,
    )

    try:
        with self.connection_config.get_client(protocol=self.protocol) as client:
            await client.get_file(rpath=remote_path, lpath=local_path.as_posix())
            # Some filesystems materialize a directory; collapse it to one file.
            self.handle_directory_download(lpath=local_path)
    except Exception as e:
        self.log_error(
            "File download failed",
            error=e,
            context={"file_path": remote_path, "file_id": file_data.identifier},
        )
        raise self.wrap_error(e=e)

    self.log_download_complete(
        file_path=remote_path,
        file_id=file_data.identifier,
        download_path=str(local_path),
    )

    return self.generate_download_response(file_data=file_data, download_path=local_path)
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
class FsspecUploaderConfig(FileConfig, UploaderConfig):
    """Uploader configuration for fsspec destinations; combines the shared file and uploader options without adding fields."""

    pass
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# Bound typevar so connector subclasses can narrow the uploader-config type they accept.
FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
@dataclass
class FsspecUploader(Uploader):
    """Destination uploader that writes staged files to an fsspec-compatible store.

    ``run`` and ``run_async`` perform the identical synchronous upload; the shared
    logic lives in ``_do_upload`` so the two entry points cannot drift apart.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: FsspecUploaderConfigT = field(default=None)
    connection_config: FsspecConnectionConfigT

    def is_async(self) -> bool:
        # Defer to the underlying filesystem implementation's capability flag.
        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
            return client.async_impl

    @property
    def fs(self) -> "AbstractFileSystem":
        from fsspec import get_filesystem_class

        # NOTE(review): connection_config is a required field, so the falsy guard
        # looks defensive; kept for backward compatibility.
        fs_kwargs = self.connection_config.get_access_config() if self.connection_config else {}
        return get_filesystem_class(self.upload_config.protocol)(
            **fs_kwargs,
        )

    def __post_init__(self):
        super().__post_init__()
        # TODO: Consider using `kw_only` instead
        if not self.upload_config:
            # Mimics the standard missing-argument TypeError for a clearer failure.
            raise TypeError(
                f"{self.__class__.__name__}.__init__() "
                "missing 1 required positional argument: 'upload_config'"
            )

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def precheck(self) -> None:
        """Validate connectivity and write access by writing an empty marker object."""
        from fsspec import get_filesystem_class

        self.log_operation_start("Connection validation", protocol=self.upload_config.protocol)

        try:
            fs = get_filesystem_class(self.upload_config.protocol)(
                **self.connection_config.get_access_config(),
            )
            upload_path = Path(self.upload_config.path_without_protocol) / "_empty"
            fs.write_bytes(path=upload_path.as_posix(), value=b"")
        except Exception as e:
            self.log_connection_failed(
                connector_type=self.connector_type,
                error=e,
                endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
            )
            raise self.wrap_error(e=e)
        self.log_connection_validated(
            connector_type=self.connector_type,
            endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
        )

    def get_upload_path(self, file_data: FileData) -> Path:
        """Return the destination path: configured root + source relative path + ``.json``."""
        upload_path = Path(
            self.upload_config.path_without_protocol
        ) / file_data.source_identifiers.relative_path.lstrip("/")
        updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
        return updated_upload_path

    def _do_upload(self, path_str: str, upload_path: Path) -> None:
        # Shared synchronous upload used by both run() and run_async(); the async
        # entry point performed the same blocking upload in the original code.
        self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
        try:
            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
                client.upload(lpath=path_str, rpath=upload_path.as_posix())
        except Exception as e:
            self.log_error(
                "File upload failed",
                error=e,
                context={"file_path": path_str, "destination": upload_path.as_posix()},
            )
            raise self.wrap_error(e=e)
        self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the staged file at ``path`` to the location derived from ``file_data``."""
        self._do_upload(str(path.resolve()), self.get_upload_path(file_data=file_data))

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Async entry point; the upload itself remains synchronous, matching prior behavior."""
        self._do_upload(str(path.resolve()), self.get_upload_path(file_data=file_data))
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from time import time
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Union
|
|
8
|
+
|
|
9
|
+
from dateutil import parser
|
|
10
|
+
from pydantic import Field, Secret
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
|
|
13
|
+
from unstructured_ingest.error import ProviderError, UserError, ValueError
|
|
14
|
+
from unstructured_ingest.logger import logger
|
|
15
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
16
|
+
DestinationRegistryEntry,
|
|
17
|
+
SourceRegistryEntry,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.processes.connectors.fsspec.fsspec import (
|
|
20
|
+
FsspecAccessConfig,
|
|
21
|
+
FsspecConnectionConfig,
|
|
22
|
+
FsspecDownloader,
|
|
23
|
+
FsspecDownloaderConfig,
|
|
24
|
+
FsspecIndexer,
|
|
25
|
+
FsspecIndexerConfig,
|
|
26
|
+
FsspecUploader,
|
|
27
|
+
FsspecUploaderConfig,
|
|
28
|
+
)
|
|
29
|
+
from unstructured_ingest.processes.utils.blob_storage import (
|
|
30
|
+
BlobStoreUploadStager,
|
|
31
|
+
BlobStoreUploadStagerConfig,
|
|
32
|
+
)
|
|
33
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
34
|
+
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
35
|
+
|
|
36
|
+
# gcsfs is an optional extra; import only for type checking so the module
# stays importable without it.
if TYPE_CHECKING:
    from gcsfs import GCSFileSystem

# Registry key identifying this connector.
CONNECTOR_TYPE = "gcs"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class GcsIndexerConfig(FsspecIndexerConfig):
    """Indexer configuration for GCS; inherits all fsspec indexer options unchanged."""

    pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
service_account_key_description = """
|
|
47
|
+
Options:
|
|
48
|
+
- ``None``, GCSFS will attempt to guess your credentials in the
|
|
49
|
+
following order: gcloud CLI default, gcsfs cached token, google compute
|
|
50
|
+
metadata service, anonymous.
|
|
51
|
+
- ``'google_default'``, your default gcloud credentials will be used,
|
|
52
|
+
which are typically established by doing ``gcloud login`` in a terminal.
|
|
53
|
+
- ``'cache'``, credentials from previously successful gcsfs
|
|
54
|
+
authentication will be used (use this after "browser" auth succeeded)
|
|
55
|
+
- ``'anon'``, no authentication is performed, and you can only
|
|
56
|
+
access data which is accessible to allUsers (in this case, the project and
|
|
57
|
+
access level parameters are meaningless)
|
|
58
|
+
- ``'browser'``, you get an access code with which you can
|
|
59
|
+
authenticate via a specially provided URL
|
|
60
|
+
- if ``'cloud'``, we assume we are running within google compute
|
|
61
|
+
or google container engine, and query the internal metadata directly for
|
|
62
|
+
a token.
|
|
63
|
+
- you may supply a token generated by the
|
|
64
|
+
[gcloud](https://cloud.google.com/sdk/docs/)
|
|
65
|
+
utility; this is either a python dictionary or the name of a file
|
|
66
|
+
containing the JSON returned by logging in with the gcloud CLI tool.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class GcsAccessConfig(FsspecAccessConfig):
    """Access credentials for GCS.

    ``service_account_key`` accepts any of gcsfs's token forms; ``model_post_init``
    resolves it into ``token``, the value gcsfs actually consumes.
    """

    service_account_key: Optional[str] = Field(
        default=None, description=service_account_key_description
    )
    # Derived field populated from service_account_key; never set directly.
    token: Union[str, dict, None] = Field(init=False, default=None)

    def model_post_init(self, __context: Any) -> None:
        """Resolve ``service_account_key`` into ``token``.

        Accepted forms, in order: empty/None (leave token unset), a gcsfs auth
        constant, inline JSON for a service-account key, or a path to a key file.
        Raises the package ``ValueError`` for anything else.
        """
        ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"

        # Case: null value
        if not self.service_account_key:
            return

        # Case: one of auth constants
        if self.service_account_key in ALLOWED_AUTH_VALUES:
            self.token = self.service_account_key
            return

        # Case: token as json — parse once (the original parsed twice).
        parsed_key = json_to_dict(self.service_account_key)
        if isinstance(parsed_key, dict):
            self.token = parsed_key
            return

        # Case: path to token
        if Path(self.service_account_key).is_file():
            self.token = self.service_account_key
            return

        raise ValueError("Invalid auth token value")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class GcsConnectionConfig(FsspecConnectionConfig):
    """Connection settings for GCS, wrapping gcsfs via the shared fsspec machinery."""

    supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
    access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["GCSFileSystem", None, None]:
        """Yield a gcsfs client; the dependency check runs at call time."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate gcsfs/OS errors into UserError/ProviderError.

        Returns the translated exception for the caller to raise — the original
        implementation raised directly, contradicting the declared return type
        and the sibling connectors' return-based convention. Unrecognized errors
        are logged and returned unchanged.
        """
        # https://github.com/fsspec/gcsfs/blob/main/gcsfs/retry.py#L79
        from gcsfs.retry import HttpError

        if isinstance(e, FileNotFoundError):
            return UserError(f"File not found: {e}")
        if isinstance(e, OSError) and "Forbidden" in str(e):
            return UserError(e)
        # NOTE: ValueError here is the package's own error type, not the builtin.
        if isinstance(e, ValueError) and "Bad Request" in str(e):
            return UserError(e)
        if isinstance(e, HttpError) and (http_error_code := e.code):
            message = e.message or e
            # 4xx -> caller's fault; 5xx -> provider's fault.
            if 400 <= http_error_code < 500:
                return UserError(message)
            if http_error_code >= 500:
                return ProviderError(message)
        logger.error(f"({type(e)} from gcs): {e}", exc_info=True)
        return e
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass
class GcsIndexer(FsspecIndexer):
    """Indexer for GCS buckets; maps gcsfs file-info dicts to source metadata."""

    connection_config: GcsConnectionConfig
    index_config: GcsIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build FileDataSourceMetadata from a gcsfs ``info()``-style dict.

        Timestamps are normalized to epoch-seconds strings; absent keys
        ("updated", "timeCreated", "size", "etag", "id") yield None.
        """
        path = file_info["name"]
        date_created = None
        date_modified = None
        if modified_at_str := file_info.get("updated"):
            date_modified = str(parser.parse(modified_at_str).timestamp())
        if created_at_str := file_info.get("timeCreated"):
            date_created = str(parser.parse(created_at_str).timestamp())

        # .get() already returns None for a missing key; the original added a
        # redundant `if "size" in file_info` guard around it.
        file_size = file_info.get("size")

        version = file_info.get("etag")
        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
            "file_id": file_info.get("id"),
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class GcsDownloaderConfig(FsspecDownloaderConfig):
    """Downloader configuration for GCS; inherits all fsspec downloader options unchanged."""

    pass
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@dataclass
class GcsDownloader(FsspecDownloader):
    """Downloader for GCS; all behavior comes from FsspecDownloader."""

    # fsspec protocol name used when building the client.
    protocol: str = "gcs"
    connection_config: GcsConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class GcsUploaderConfig(FsspecUploaderConfig):
    """Uploader configuration for GCS; inherits all fsspec uploader options unchanged."""

    pass
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@dataclass
class GcsUploader(FsspecUploader):
    """Uploader for GCS; all behavior comes from FsspecUploader."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: GcsConnectionConfig
    # Validated as required in FsspecUploader.__post_init__ despite the None default.
    upload_config: GcsUploaderConfig = field(default=None)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# Registers GCS as a source connector (indexing + downloading).
gcs_source_entry = SourceRegistryEntry(
    indexer=GcsIndexer,
    indexer_config=GcsIndexerConfig,
    downloader=GcsDownloader,
    downloader_config=GcsDownloaderConfig,
    connection_config=GcsConnectionConfig,
)
|
|
196
|
+
|
|
197
|
+
# Registers GCS as a destination connector (staging + uploading).
gcs_destination_entry = DestinationRegistryEntry(
    uploader=GcsUploader,
    uploader_config=GcsUploaderConfig,
    connection_config=GcsConnectionConfig,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
|