unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -36,35 +37,59 @@ def azure_json_serial(obj):
|
|
|
36
37
|
return json_serial(obj)
|
|
37
38
|
|
|
38
39
|
|
|
39
|
-
@dataclass
|
|
40
40
|
class AzureIndexerConfig(FsspecIndexerConfig):
|
|
41
41
|
pass
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
@dataclass
|
|
45
44
|
class AzureAccessConfig(FsspecAccessConfig):
|
|
46
|
-
account_name: Optional[str] =
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
45
|
+
account_name: Optional[str] = Field(
|
|
46
|
+
default=None,
|
|
47
|
+
description="The storage account name. This is used to authenticate "
|
|
48
|
+
"requests signed with an account key and to construct "
|
|
49
|
+
"the storage endpoint. It is required unless a connection "
|
|
50
|
+
"string is given, or if a custom domain is used with "
|
|
51
|
+
"anonymous authentication.",
|
|
52
|
+
)
|
|
53
|
+
account_key: Optional[str] = Field(
|
|
54
|
+
default=None,
|
|
55
|
+
description="The storage account key. This is used for shared key "
|
|
56
|
+
"authentication. If any of account key, sas token or "
|
|
57
|
+
"client_id are not specified, anonymous access will be used.",
|
|
58
|
+
)
|
|
59
|
+
connection_string: Optional[str] = Field(
|
|
60
|
+
default=None,
|
|
61
|
+
description="If specified, this will override all other parameters. See "
|
|
62
|
+
"http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501
|
|
63
|
+
"for the connection string format.",
|
|
64
|
+
)
|
|
65
|
+
sas_token: Optional[str] = Field(
|
|
66
|
+
default=None,
|
|
67
|
+
description="A shared access signature token to use to authenticate "
|
|
68
|
+
"requests instead of the account key. If account key and "
|
|
69
|
+
"sas token are both specified, account key will be used "
|
|
70
|
+
"to sign. If any of account key, sas token or client_id "
|
|
71
|
+
"are not specified, anonymous access will be used.",
|
|
72
|
+
)
|
|
50
73
|
|
|
51
|
-
def
|
|
74
|
+
def model_post_init(self, __context: Any) -> None:
|
|
52
75
|
if self.connection_string is None and self.account_name is None:
|
|
53
76
|
raise ValueError("either connection_string or account_name must be set")
|
|
54
77
|
|
|
55
78
|
|
|
56
|
-
|
|
79
|
+
SecretAzureAccessConfig = Secret[AzureAccessConfig]
|
|
80
|
+
|
|
81
|
+
|
|
57
82
|
class AzureConnectionConfig(FsspecConnectionConfig):
|
|
58
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["az"])
|
|
59
|
-
access_config:
|
|
60
|
-
|
|
83
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
|
|
84
|
+
access_config: SecretAzureAccessConfig = Field(
|
|
85
|
+
default_factory=lambda: SecretAzureAccessConfig(secret_value=AzureAccessConfig())
|
|
61
86
|
)
|
|
62
|
-
connector_type: str = CONNECTOR_TYPE
|
|
87
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
63
88
|
|
|
64
89
|
def get_access_config(self) -> dict[str, Any]:
|
|
65
90
|
# Avoid injecting None by filtering out k,v pairs where the value is None
|
|
66
91
|
access_configs: dict[str, Any] = {
|
|
67
|
-
k: v for k, v in self.access_config.
|
|
92
|
+
k: v for k, v in self.access_config.get_secret_value().dict().items() if v
|
|
68
93
|
}
|
|
69
94
|
return access_configs
|
|
70
95
|
|
|
@@ -75,6 +100,10 @@ class AzureIndexer(FsspecIndexer):
|
|
|
75
100
|
index_config: AzureIndexerConfig
|
|
76
101
|
connector_type: str = CONNECTOR_TYPE
|
|
77
102
|
|
|
103
|
+
@requires_dependencies(["adlfs", "fsspec"], extras="azure")
|
|
104
|
+
def precheck(self) -> None:
|
|
105
|
+
super().precheck()
|
|
106
|
+
|
|
78
107
|
def sterilize_info(self, path) -> dict:
|
|
79
108
|
info = self.fs.info(path=path)
|
|
80
109
|
return sterilize_dict(data=info, default=azure_json_serial)
|
|
@@ -84,7 +113,6 @@ class AzureIndexer(FsspecIndexer):
|
|
|
84
113
|
return super().run(**kwargs)
|
|
85
114
|
|
|
86
115
|
|
|
87
|
-
@dataclass
|
|
88
116
|
class AzureDownloaderConfig(FsspecDownloaderConfig):
|
|
89
117
|
pass
|
|
90
118
|
|
|
@@ -105,7 +133,6 @@ class AzureDownloader(FsspecDownloader):
|
|
|
105
133
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
106
134
|
|
|
107
135
|
|
|
108
|
-
@dataclass
|
|
109
136
|
class AzureUploaderConfig(FsspecUploaderConfig):
|
|
110
137
|
pass
|
|
111
138
|
|
|
@@ -120,6 +147,10 @@ class AzureUploader(FsspecUploader):
|
|
|
120
147
|
def __post_init__(self):
|
|
121
148
|
super().__post_init__()
|
|
122
149
|
|
|
150
|
+
@requires_dependencies(["adlfs", "fsspec"], extras="azure")
|
|
151
|
+
def precheck(self) -> None:
|
|
152
|
+
super().precheck()
|
|
153
|
+
|
|
123
154
|
@requires_dependencies(["adlfs", "fsspec"], extras="azure")
|
|
124
155
|
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
|
125
156
|
return super().run(contents=contents, **kwargs)
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -25,35 +26,38 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
25
26
|
CONNECTOR_TYPE = "box"
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
@dataclass
|
|
29
29
|
class BoxIndexerConfig(FsspecIndexerConfig):
|
|
30
30
|
pass
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
@dataclass
|
|
34
33
|
class BoxAccessConfig(FsspecAccessConfig):
|
|
35
|
-
box_app_config: Optional[str] =
|
|
34
|
+
box_app_config: Optional[str] = Field(
|
|
35
|
+
default=None, description="Path to Box app credentials as json file."
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
SecretBoxAccessConfig = Secret[BoxAccessConfig]
|
|
36
40
|
|
|
37
41
|
|
|
38
|
-
@dataclass
|
|
39
42
|
class BoxConnectionConfig(FsspecConnectionConfig):
|
|
40
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["box"])
|
|
41
|
-
access_config:
|
|
42
|
-
|
|
43
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
|
|
44
|
+
access_config: SecretBoxAccessConfig = Field(
|
|
45
|
+
default_factory=lambda: SecretBoxAccessConfig(secret_value=BoxAccessConfig())
|
|
43
46
|
)
|
|
44
|
-
connector_type: str = CONNECTOR_TYPE
|
|
47
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
45
48
|
|
|
46
49
|
def get_access_config(self) -> dict[str, Any]:
|
|
47
50
|
# Return access_kwargs with oauth. The oauth object can not be stored directly in the config
|
|
48
51
|
# because it is not serializable.
|
|
49
52
|
from boxsdk import JWTAuth
|
|
50
53
|
|
|
54
|
+
ac = self.access_config.get_secret_value()
|
|
51
55
|
access_kwargs_with_oauth: dict[str, Any] = {
|
|
52
56
|
"oauth": JWTAuth.from_settings_file(
|
|
53
|
-
|
|
57
|
+
ac.box_app_config,
|
|
54
58
|
),
|
|
55
59
|
}
|
|
56
|
-
access_config: dict[str, Any] =
|
|
60
|
+
access_config: dict[str, Any] = ac.dict()
|
|
57
61
|
access_config.pop("box_app_config", None)
|
|
58
62
|
access_kwargs_with_oauth.update(access_config)
|
|
59
63
|
|
|
@@ -70,8 +74,11 @@ class BoxIndexer(FsspecIndexer):
|
|
|
70
74
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
71
75
|
return super().run(**kwargs)
|
|
72
76
|
|
|
77
|
+
@requires_dependencies(["boxfs"], extras="box")
|
|
78
|
+
def precheck(self) -> None:
|
|
79
|
+
super().precheck()
|
|
80
|
+
|
|
73
81
|
|
|
74
|
-
@dataclass
|
|
75
82
|
class BoxDownloaderConfig(FsspecDownloaderConfig):
|
|
76
83
|
pass
|
|
77
84
|
|
|
@@ -92,7 +99,6 @@ class BoxDownloader(FsspecDownloader):
|
|
|
92
99
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
93
100
|
|
|
94
101
|
|
|
95
|
-
@dataclass
|
|
96
102
|
class BoxUploaderConfig(FsspecUploaderConfig):
|
|
97
103
|
pass
|
|
98
104
|
|
|
@@ -107,6 +113,10 @@ class BoxUploader(FsspecUploader):
|
|
|
107
113
|
def __post_init__(self):
|
|
108
114
|
super().__post_init__()
|
|
109
115
|
|
|
116
|
+
@requires_dependencies(["boxfs"], extras="box")
|
|
117
|
+
def precheck(self) -> None:
|
|
118
|
+
super().precheck()
|
|
119
|
+
|
|
110
120
|
@requires_dependencies(["boxfs"], extras="box")
|
|
111
121
|
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
|
112
122
|
return super().run(contents=contents, **kwargs)
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -26,23 +27,23 @@ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_d
|
|
|
26
27
|
CONNECTOR_TYPE = "dropbox"
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@dataclass
|
|
30
30
|
class DropboxIndexerConfig(FsspecIndexerConfig):
|
|
31
31
|
pass
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
@dataclass
|
|
35
34
|
class DropboxAccessConfig(FsspecAccessConfig):
|
|
36
|
-
token: Optional[str] = None
|
|
35
|
+
token: Optional[str] = Field(default=None, description="Dropbox access token.")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
SecretDropboxAccessConfig = Secret[DropboxAccessConfig]
|
|
37
39
|
|
|
38
40
|
|
|
39
|
-
@dataclass
|
|
40
41
|
class DropboxConnectionConfig(FsspecConnectionConfig):
|
|
41
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
|
|
42
|
-
access_config:
|
|
43
|
-
|
|
42
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
|
|
43
|
+
access_config: SecretDropboxAccessConfig = Field(
|
|
44
|
+
default_factory=lambda: SecretDropboxAccessConfig(secret_value=DropboxAccessConfig())
|
|
44
45
|
)
|
|
45
|
-
connector_type: str = CONNECTOR_TYPE
|
|
46
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
46
47
|
|
|
47
48
|
|
|
48
49
|
@dataclass
|
|
@@ -57,6 +58,10 @@ class DropboxIndexer(FsspecIndexer):
|
|
|
57
58
|
if not self.index_config.path_without_protocol.startswith("/"):
|
|
58
59
|
self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
|
|
59
60
|
|
|
61
|
+
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
62
|
+
def precheck(self) -> None:
|
|
63
|
+
super().precheck()
|
|
64
|
+
|
|
60
65
|
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
61
66
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
62
67
|
return super().run(**kwargs)
|
|
@@ -68,7 +73,6 @@ class DropboxIndexer(FsspecIndexer):
|
|
|
68
73
|
return sterilize_dict(data=info)
|
|
69
74
|
|
|
70
75
|
|
|
71
|
-
@dataclass
|
|
72
76
|
class DropboxDownloaderConfig(FsspecDownloaderConfig):
|
|
73
77
|
pass
|
|
74
78
|
|
|
@@ -91,7 +95,6 @@ class DropboxDownloader(FsspecDownloader):
|
|
|
91
95
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
92
96
|
|
|
93
97
|
|
|
94
|
-
@dataclass
|
|
95
98
|
class DropboxUploaderConfig(FsspecUploaderConfig):
|
|
96
99
|
pass
|
|
97
100
|
|
|
@@ -106,6 +109,10 @@ class DropboxUploader(FsspecUploader):
|
|
|
106
109
|
def __post_init__(self):
|
|
107
110
|
super().__post_init__()
|
|
108
111
|
|
|
112
|
+
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
113
|
+
def precheck(self) -> None:
|
|
114
|
+
super().precheck()
|
|
115
|
+
|
|
109
116
|
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
110
117
|
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
|
111
118
|
return super().run(contents=contents, **kwargs)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import contextlib
|
|
4
|
-
import fnmatch
|
|
5
4
|
from dataclasses import dataclass, field
|
|
6
5
|
from datetime import datetime
|
|
7
6
|
from pathlib import Path
|
|
@@ -9,10 +8,13 @@ from time import time
|
|
|
9
8
|
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
|
|
10
9
|
from uuid import NAMESPACE_DNS, uuid5
|
|
11
10
|
|
|
12
|
-
from
|
|
11
|
+
from pydantic import BaseModel, Field, Secret
|
|
13
12
|
|
|
14
|
-
from unstructured_ingest.
|
|
15
|
-
|
|
13
|
+
from unstructured_ingest.error import (
|
|
14
|
+
DestinationConnectionError,
|
|
15
|
+
SourceConnectionError,
|
|
16
|
+
SourceConnectionNetworkError,
|
|
17
|
+
)
|
|
16
18
|
from unstructured_ingest.v2.interfaces import (
|
|
17
19
|
AccessConfig,
|
|
18
20
|
ConnectionConfig,
|
|
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
20
22
|
DownloaderConfig,
|
|
21
23
|
DownloadResponse,
|
|
22
24
|
FileData,
|
|
25
|
+
FileDataSourceMetadata,
|
|
23
26
|
Indexer,
|
|
24
27
|
IndexerConfig,
|
|
25
28
|
SourceIdentifiers,
|
|
@@ -36,17 +39,12 @@ if TYPE_CHECKING:
|
|
|
36
39
|
CONNECTOR_TYPE = "fsspec"
|
|
37
40
|
|
|
38
41
|
|
|
39
|
-
class
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
class FileConfig(Base):
|
|
46
|
-
remote_url: str
|
|
47
|
-
protocol: str = field(init=False)
|
|
48
|
-
path_without_protocol: str = field(init=False)
|
|
49
|
-
supported_protocols: list[str] = field(
|
|
42
|
+
class FileConfig(BaseModel):
|
|
43
|
+
remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
|
|
44
|
+
protocol: str = Field(init=False)
|
|
45
|
+
path_without_protocol: str = Field(init=False)
|
|
46
|
+
supported_protocols: list[str] = Field(
|
|
47
|
+
init=False,
|
|
50
48
|
default_factory=lambda: [
|
|
51
49
|
"s3",
|
|
52
50
|
"s3a",
|
|
@@ -57,38 +55,27 @@ class FileConfig(Base):
|
|
|
57
55
|
"box",
|
|
58
56
|
"dropbox",
|
|
59
57
|
"sftp",
|
|
60
|
-
]
|
|
58
|
+
],
|
|
61
59
|
)
|
|
62
60
|
|
|
63
|
-
def
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
"Protocol {} not supported yet, only {} are supported.".format(
|
|
69
|
-
self.protocol, ", ".join(self.supported_protocols)
|
|
70
|
-
),
|
|
71
|
-
)
|
|
61
|
+
def __init__(self, **data):
|
|
62
|
+
protocol, path_without_protocol = data["remote_url"].split("://")
|
|
63
|
+
data["protocol"] = protocol
|
|
64
|
+
data["path_without_protocol"] = path_without_protocol
|
|
65
|
+
super().__init__(**data)
|
|
72
66
|
|
|
73
67
|
|
|
74
|
-
@dataclass
|
|
75
68
|
class FsspecIndexerConfig(FileConfig, IndexerConfig):
|
|
76
69
|
recursive: bool = False
|
|
77
|
-
file_glob: Optional[list[str]] = None
|
|
78
70
|
|
|
79
71
|
|
|
80
|
-
@dataclass
|
|
81
72
|
class FsspecAccessConfig(AccessConfig):
|
|
82
73
|
pass
|
|
83
74
|
|
|
84
75
|
|
|
85
|
-
FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
@dataclass
|
|
89
76
|
class FsspecConnectionConfig(ConnectionConfig):
|
|
90
|
-
access_config:
|
|
91
|
-
connector_type: str = CONNECTOR_TYPE
|
|
77
|
+
access_config: Secret[FsspecAccessConfig]
|
|
78
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
92
79
|
|
|
93
80
|
|
|
94
81
|
FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
|
|
@@ -99,7 +86,7 @@ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnect
|
|
|
99
86
|
class FsspecIndexer(Indexer):
|
|
100
87
|
connection_config: FsspecConnectionConfigT
|
|
101
88
|
index_config: FsspecIndexerConfigT
|
|
102
|
-
connector_type: str = CONNECTOR_TYPE
|
|
89
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
103
90
|
|
|
104
91
|
@property
|
|
105
92
|
def fs(self) -> "AbstractFileSystem":
|
|
@@ -109,17 +96,7 @@ class FsspecIndexer(Indexer):
|
|
|
109
96
|
**self.connection_config.get_access_config(),
|
|
110
97
|
)
|
|
111
98
|
|
|
112
|
-
def
|
|
113
|
-
if self.index_config.file_glob is None:
|
|
114
|
-
return True
|
|
115
|
-
patterns = self.index_config.file_glob
|
|
116
|
-
for pattern in patterns:
|
|
117
|
-
if fnmatch.filter([path], pattern):
|
|
118
|
-
return True
|
|
119
|
-
logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
|
|
120
|
-
return False
|
|
121
|
-
|
|
122
|
-
def check_connection(self):
|
|
99
|
+
def precheck(self) -> None:
|
|
123
100
|
from fsspec import get_filesystem_class
|
|
124
101
|
|
|
125
102
|
try:
|
|
@@ -157,10 +134,10 @@ class FsspecIndexer(Indexer):
|
|
|
157
134
|
else:
|
|
158
135
|
raise TypeError(f"unhandled response type from find: {type(found)}")
|
|
159
136
|
|
|
160
|
-
def get_metadata(self, path: str) ->
|
|
137
|
+
def get_metadata(self, path: str) -> FileDataSourceMetadata:
|
|
161
138
|
date_created = None
|
|
162
139
|
date_modified = None
|
|
163
|
-
|
|
140
|
+
file_size = None
|
|
164
141
|
try:
|
|
165
142
|
created: Optional[Any] = self.fs.created(path)
|
|
166
143
|
if created:
|
|
@@ -180,6 +157,8 @@ class FsspecIndexer(Indexer):
|
|
|
180
157
|
date_modified = str(modified)
|
|
181
158
|
except NotImplementedError:
|
|
182
159
|
pass
|
|
160
|
+
with contextlib.suppress(AttributeError):
|
|
161
|
+
file_size = self.fs.size(path)
|
|
183
162
|
|
|
184
163
|
version = self.fs.checksum(path)
|
|
185
164
|
metadata: dict[str, str] = {}
|
|
@@ -189,15 +168,19 @@ class FsspecIndexer(Indexer):
|
|
|
189
168
|
"protocol": self.index_config.protocol,
|
|
190
169
|
"remote_file_path": self.index_config.remote_url,
|
|
191
170
|
}
|
|
171
|
+
file_stat = self.fs.stat(path=path)
|
|
172
|
+
if file_id := file_stat.get("id"):
|
|
173
|
+
record_locator["file_id"] = file_id
|
|
192
174
|
if metadata:
|
|
193
175
|
record_locator["metadata"] = metadata
|
|
194
|
-
return
|
|
176
|
+
return FileDataSourceMetadata(
|
|
195
177
|
date_created=date_created,
|
|
196
178
|
date_modified=date_modified,
|
|
197
179
|
date_processed=str(time()),
|
|
198
180
|
version=str(version),
|
|
199
181
|
url=f"{self.index_config.protocol}://{path}",
|
|
200
182
|
record_locator=record_locator,
|
|
183
|
+
filesize_bytes=file_size,
|
|
201
184
|
)
|
|
202
185
|
|
|
203
186
|
def sterilize_info(self, path) -> dict:
|
|
@@ -205,8 +188,7 @@ class FsspecIndexer(Indexer):
|
|
|
205
188
|
return sterilize_dict(data=info)
|
|
206
189
|
|
|
207
190
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
208
|
-
|
|
209
|
-
files = [f for f in raw_files if self.does_path_match_glob(f)]
|
|
191
|
+
files = self.list_files()
|
|
210
192
|
for file in files:
|
|
211
193
|
# Note: we remove any remaining leading slashes (Box introduces these)
|
|
212
194
|
# to get a valid relative path
|
|
@@ -227,7 +209,6 @@ class FsspecIndexer(Indexer):
|
|
|
227
209
|
)
|
|
228
210
|
|
|
229
211
|
|
|
230
|
-
@dataclass
|
|
231
212
|
class FsspecDownloaderConfig(DownloaderConfig):
|
|
232
213
|
pass
|
|
233
214
|
|
|
@@ -255,13 +236,6 @@ class FsspecDownloader(Downloader):
|
|
|
255
236
|
**self.connection_config.get_access_config(),
|
|
256
237
|
)
|
|
257
238
|
|
|
258
|
-
def get_download_path(self, file_data: FileData) -> Path:
|
|
259
|
-
return (
|
|
260
|
-
self.download_dir / Path(file_data.source_identifiers.relative_path)
|
|
261
|
-
if self.download_config
|
|
262
|
-
else Path(file_data.source_identifiers.rel_path)
|
|
263
|
-
)
|
|
264
|
-
|
|
265
239
|
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
266
240
|
download_path = self.get_download_path(file_data=file_data)
|
|
267
241
|
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -285,9 +259,10 @@ class FsspecDownloader(Downloader):
|
|
|
285
259
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
286
260
|
|
|
287
261
|
|
|
288
|
-
@dataclass
|
|
289
262
|
class FsspecUploaderConfig(FileConfig, UploaderConfig):
|
|
290
|
-
overwrite: bool =
|
|
263
|
+
overwrite: bool = Field(
|
|
264
|
+
default=False, description="If true, an existing file will be overwritten."
|
|
265
|
+
)
|
|
291
266
|
|
|
292
267
|
|
|
293
268
|
FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
|
|
@@ -315,6 +290,19 @@ class FsspecUploader(Uploader):
|
|
|
315
290
|
f"missing 1 required positional argument: 'upload_config'"
|
|
316
291
|
)
|
|
317
292
|
|
|
293
|
+
def precheck(self) -> None:
|
|
294
|
+
from fsspec import get_filesystem_class
|
|
295
|
+
|
|
296
|
+
try:
|
|
297
|
+
fs = get_filesystem_class(self.upload_config.protocol)(
|
|
298
|
+
**self.connection_config.get_access_config(),
|
|
299
|
+
)
|
|
300
|
+
root_dir = self.upload_config.path_without_protocol.split("/")[0]
|
|
301
|
+
fs.ls(path=root_dir, detail=False)
|
|
302
|
+
except Exception as e:
|
|
303
|
+
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
304
|
+
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
305
|
+
|
|
318
306
|
def get_upload_path(self, file_data: FileData) -> Path:
|
|
319
307
|
upload_path = (
|
|
320
308
|
Path(self.upload_config.path_without_protocol)
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional, Union
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
10
11
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
@@ -26,17 +27,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
27
|
CONNECTOR_TYPE = "gcs"
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@dataclass
|
|
30
30
|
class GcsIndexerConfig(FsspecIndexerConfig):
|
|
31
31
|
pass
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
|
|
34
|
+
service_account_key_description = """
|
|
35
|
+
Options:
|
|
36
|
+
- ``None``, GCSFS will attempt to guess your credentials in the
|
|
37
|
+
following order: gcloud CLI default, gcsfs cached token, google compute
|
|
38
|
+
metadata service, anonymous.
|
|
39
|
+
- ``'google_default'``, your default gcloud credentials will be used,
|
|
40
|
+
which are typically established by doing ``gcloud login`` in a terminal.
|
|
41
|
+
- ``'cache'``, credentials from previously successful gcsfs
|
|
42
|
+
authentication will be used (use this after "browser" auth succeeded)
|
|
43
|
+
- ``'anon'``, no authentication is performed, and you can only
|
|
44
|
+
access data which is accessible to allUsers (in this case, the project and
|
|
45
|
+
access level parameters are meaningless)
|
|
46
|
+
- ``'browser'``, you get an access code with which you can
|
|
47
|
+
authenticate via a specially provided URL
|
|
48
|
+
- if ``'cloud'``, we assume we are running within google compute
|
|
49
|
+
or google container engine, and query the internal metadata directly for
|
|
50
|
+
a token.
|
|
51
|
+
- you may supply a token generated by the
|
|
52
|
+
[gcloud](https://cloud.google.com/sdk/docs/)
|
|
53
|
+
utility; this is either a python dictionary or the name of a file
|
|
54
|
+
containing the JSON returned by logging in with the gcloud CLI tool.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
|
|
35
58
|
class GcsAccessConfig(FsspecAccessConfig):
|
|
36
|
-
service_account_key: Optional[str] =
|
|
37
|
-
|
|
59
|
+
service_account_key: Optional[str] = Field(
|
|
60
|
+
default=None, description=service_account_key_description
|
|
61
|
+
)
|
|
62
|
+
token: Union[str, dict, None] = Field(init=False, default=None)
|
|
38
63
|
|
|
39
|
-
def
|
|
64
|
+
def model_post_init(self, __context: Any) -> None:
|
|
40
65
|
ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"
|
|
41
66
|
|
|
42
67
|
# Case: null value
|
|
@@ -61,13 +86,15 @@ class GcsAccessConfig(FsspecAccessConfig):
|
|
|
61
86
|
raise ValueError("Invalid auth token value")
|
|
62
87
|
|
|
63
88
|
|
|
64
|
-
|
|
89
|
+
SecretGcsAccessConfig = Secret[GcsAccessConfig]
|
|
90
|
+
|
|
91
|
+
|
|
65
92
|
class GcsConnectionConfig(FsspecConnectionConfig):
|
|
66
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"])
|
|
67
|
-
access_config:
|
|
68
|
-
|
|
93
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
|
|
94
|
+
access_config: SecretGcsAccessConfig = Field(
|
|
95
|
+
default_factory=lambda: SecretGcsAccessConfig(secret_value=GcsAccessConfig())
|
|
69
96
|
)
|
|
70
|
-
connector_type: str = CONNECTOR_TYPE
|
|
97
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
71
98
|
|
|
72
99
|
|
|
73
100
|
@dataclass
|
|
@@ -80,8 +107,11 @@ class GcsIndexer(FsspecIndexer):
|
|
|
80
107
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
81
108
|
return super().run(**kwargs)
|
|
82
109
|
|
|
110
|
+
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
111
|
+
def precheck(self) -> None:
|
|
112
|
+
super().precheck()
|
|
113
|
+
|
|
83
114
|
|
|
84
|
-
@dataclass
|
|
85
115
|
class GcsDownloaderConfig(FsspecDownloaderConfig):
|
|
86
116
|
pass
|
|
87
117
|
|
|
@@ -102,7 +132,6 @@ class GcsDownloader(FsspecDownloader):
|
|
|
102
132
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
103
133
|
|
|
104
134
|
|
|
105
|
-
@dataclass
|
|
106
135
|
class GcsUploaderConfig(FsspecUploaderConfig):
|
|
107
136
|
pass
|
|
108
137
|
|
|
@@ -117,6 +146,10 @@ class GcsUploader(FsspecUploader):
|
|
|
117
146
|
def __post_init__(self):
|
|
118
147
|
super().__post_init__()
|
|
119
148
|
|
|
149
|
+
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
150
|
+
def precheck(self) -> None:
|
|
151
|
+
super().precheck()
|
|
152
|
+
|
|
120
153
|
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
121
154
|
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
|
122
155
|
return super().run(contents=contents, **kwargs)
|