unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +9 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +27 -16
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -36,35 +37,59 @@ def azure_json_serial(obj):
|
|
|
36
37
|
return json_serial(obj)
|
|
37
38
|
|
|
38
39
|
|
|
39
|
-
@dataclass
|
|
40
40
|
class AzureIndexerConfig(FsspecIndexerConfig):
|
|
41
41
|
pass
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
@dataclass
|
|
45
44
|
class AzureAccessConfig(FsspecAccessConfig):
|
|
46
|
-
account_name: Optional[str] =
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
45
|
+
account_name: Optional[str] = Field(
|
|
46
|
+
default=None,
|
|
47
|
+
description="The storage account name. This is used to authenticate "
|
|
48
|
+
"requests signed with an account key and to construct "
|
|
49
|
+
"the storage endpoint. It is required unless a connection "
|
|
50
|
+
"string is given, or if a custom domain is used with "
|
|
51
|
+
"anonymous authentication.",
|
|
52
|
+
)
|
|
53
|
+
account_key: Optional[str] = Field(
|
|
54
|
+
default=None,
|
|
55
|
+
description="The storage account key. This is used for shared key "
|
|
56
|
+
"authentication. If any of account key, sas token or "
|
|
57
|
+
"client_id are not specified, anonymous access will be used.",
|
|
58
|
+
)
|
|
59
|
+
connection_string: Optional[str] = Field(
|
|
60
|
+
default=None,
|
|
61
|
+
description="If specified, this will override all other parameters. See "
|
|
62
|
+
"http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501
|
|
63
|
+
"for the connection string format.",
|
|
64
|
+
)
|
|
65
|
+
sas_token: Optional[str] = Field(
|
|
66
|
+
default=None,
|
|
67
|
+
description="A shared access signature token to use to authenticate "
|
|
68
|
+
"requests instead of the account key. If account key and "
|
|
69
|
+
"sas token are both specified, account key will be used "
|
|
70
|
+
"to sign. If any of account key, sas token or client_id "
|
|
71
|
+
"are not specified, anonymous access will be used.",
|
|
72
|
+
)
|
|
50
73
|
|
|
51
|
-
def
|
|
74
|
+
def model_post_init(self, __context: Any) -> None:
|
|
52
75
|
if self.connection_string is None and self.account_name is None:
|
|
53
76
|
raise ValueError("either connection_string or account_name must be set")
|
|
54
77
|
|
|
55
78
|
|
|
56
|
-
|
|
79
|
+
SecretAzureAccessConfig = Secret[AzureAccessConfig]
|
|
80
|
+
|
|
81
|
+
|
|
57
82
|
class AzureConnectionConfig(FsspecConnectionConfig):
|
|
58
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["az"])
|
|
59
|
-
access_config:
|
|
60
|
-
|
|
83
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
|
|
84
|
+
access_config: SecretAzureAccessConfig = Field(
|
|
85
|
+
default_factory=lambda: SecretAzureAccessConfig(secret_value=AzureAccessConfig())
|
|
61
86
|
)
|
|
62
|
-
connector_type: str = CONNECTOR_TYPE
|
|
87
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
63
88
|
|
|
64
89
|
def get_access_config(self) -> dict[str, Any]:
|
|
65
90
|
# Avoid injecting None by filtering out k,v pairs where the value is None
|
|
66
91
|
access_configs: dict[str, Any] = {
|
|
67
|
-
k: v for k, v in self.access_config.
|
|
92
|
+
k: v for k, v in self.access_config.get_secret_value().dict().items() if v
|
|
68
93
|
}
|
|
69
94
|
return access_configs
|
|
70
95
|
|
|
@@ -88,7 +113,6 @@ class AzureIndexer(FsspecIndexer):
|
|
|
88
113
|
return super().run(**kwargs)
|
|
89
114
|
|
|
90
115
|
|
|
91
|
-
@dataclass
|
|
92
116
|
class AzureDownloaderConfig(FsspecDownloaderConfig):
|
|
93
117
|
pass
|
|
94
118
|
|
|
@@ -109,7 +133,6 @@ class AzureDownloader(FsspecDownloader):
|
|
|
109
133
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
110
134
|
|
|
111
135
|
|
|
112
|
-
@dataclass
|
|
113
136
|
class AzureUploaderConfig(FsspecUploaderConfig):
|
|
114
137
|
pass
|
|
115
138
|
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -25,35 +26,38 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
25
26
|
CONNECTOR_TYPE = "box"
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
@dataclass
|
|
29
29
|
class BoxIndexerConfig(FsspecIndexerConfig):
|
|
30
30
|
pass
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
@dataclass
|
|
34
33
|
class BoxAccessConfig(FsspecAccessConfig):
|
|
35
|
-
box_app_config: Optional[str] =
|
|
34
|
+
box_app_config: Optional[str] = Field(
|
|
35
|
+
default=None, description="Path to Box app credentials as json file."
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
SecretBoxAccessConfig = Secret[BoxAccessConfig]
|
|
36
40
|
|
|
37
41
|
|
|
38
|
-
@dataclass
|
|
39
42
|
class BoxConnectionConfig(FsspecConnectionConfig):
|
|
40
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["box"])
|
|
41
|
-
access_config:
|
|
42
|
-
|
|
43
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
|
|
44
|
+
access_config: SecretBoxAccessConfig = Field(
|
|
45
|
+
default_factory=lambda: SecretBoxAccessConfig(secret_value=BoxAccessConfig())
|
|
43
46
|
)
|
|
44
|
-
connector_type: str = CONNECTOR_TYPE
|
|
47
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
45
48
|
|
|
46
49
|
def get_access_config(self) -> dict[str, Any]:
|
|
47
50
|
# Return access_kwargs with oauth. The oauth object can not be stored directly in the config
|
|
48
51
|
# because it is not serializable.
|
|
49
52
|
from boxsdk import JWTAuth
|
|
50
53
|
|
|
54
|
+
ac = self.access_config.get_secret_value()
|
|
51
55
|
access_kwargs_with_oauth: dict[str, Any] = {
|
|
52
56
|
"oauth": JWTAuth.from_settings_file(
|
|
53
|
-
|
|
57
|
+
ac.box_app_config,
|
|
54
58
|
),
|
|
55
59
|
}
|
|
56
|
-
access_config: dict[str, Any] =
|
|
60
|
+
access_config: dict[str, Any] = ac.dict()
|
|
57
61
|
access_config.pop("box_app_config", None)
|
|
58
62
|
access_kwargs_with_oauth.update(access_config)
|
|
59
63
|
|
|
@@ -75,7 +79,6 @@ class BoxIndexer(FsspecIndexer):
|
|
|
75
79
|
super().precheck()
|
|
76
80
|
|
|
77
81
|
|
|
78
|
-
@dataclass
|
|
79
82
|
class BoxDownloaderConfig(FsspecDownloaderConfig):
|
|
80
83
|
pass
|
|
81
84
|
|
|
@@ -96,7 +99,6 @@ class BoxDownloader(FsspecDownloader):
|
|
|
96
99
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
97
100
|
|
|
98
101
|
|
|
99
|
-
@dataclass
|
|
100
102
|
class BoxUploaderConfig(FsspecUploaderConfig):
|
|
101
103
|
pass
|
|
102
104
|
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -26,23 +27,23 @@ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_d
|
|
|
26
27
|
CONNECTOR_TYPE = "dropbox"
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@dataclass
|
|
30
30
|
class DropboxIndexerConfig(FsspecIndexerConfig):
|
|
31
31
|
pass
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
@dataclass
|
|
35
34
|
class DropboxAccessConfig(FsspecAccessConfig):
|
|
36
|
-
token: Optional[str] = None
|
|
35
|
+
token: Optional[str] = Field(default=None, description="Dropbox access token.")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
SecretDropboxAccessConfig = Secret[DropboxAccessConfig]
|
|
37
39
|
|
|
38
40
|
|
|
39
|
-
@dataclass
|
|
40
41
|
class DropboxConnectionConfig(FsspecConnectionConfig):
|
|
41
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
|
|
42
|
-
access_config:
|
|
43
|
-
|
|
42
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
|
|
43
|
+
access_config: SecretDropboxAccessConfig = Field(
|
|
44
|
+
default_factory=lambda: SecretDropboxAccessConfig(secret_value=DropboxAccessConfig())
|
|
44
45
|
)
|
|
45
|
-
connector_type: str = CONNECTOR_TYPE
|
|
46
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
46
47
|
|
|
47
48
|
|
|
48
49
|
@dataclass
|
|
@@ -72,7 +73,6 @@ class DropboxIndexer(FsspecIndexer):
|
|
|
72
73
|
return sterilize_dict(data=info)
|
|
73
74
|
|
|
74
75
|
|
|
75
|
-
@dataclass
|
|
76
76
|
class DropboxDownloaderConfig(FsspecDownloaderConfig):
|
|
77
77
|
pass
|
|
78
78
|
|
|
@@ -95,7 +95,6 @@ class DropboxDownloader(FsspecDownloader):
|
|
|
95
95
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
@dataclass
|
|
99
98
|
class DropboxUploaderConfig(FsspecUploaderConfig):
|
|
100
99
|
pass
|
|
101
100
|
|
|
@@ -8,7 +8,8 @@ from time import time
|
|
|
8
8
|
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
|
|
9
9
|
from uuid import NAMESPACE_DNS, uuid5
|
|
10
10
|
|
|
11
|
-
from
|
|
11
|
+
from pydantic import BaseModel, Field, Secret
|
|
12
|
+
|
|
12
13
|
from unstructured_ingest.error import (
|
|
13
14
|
DestinationConnectionError,
|
|
14
15
|
SourceConnectionError,
|
|
@@ -38,17 +39,12 @@ if TYPE_CHECKING:
|
|
|
38
39
|
CONNECTOR_TYPE = "fsspec"
|
|
39
40
|
|
|
40
41
|
|
|
41
|
-
class
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class FileConfig(Base):
|
|
48
|
-
remote_url: str
|
|
49
|
-
protocol: str = field(init=False)
|
|
50
|
-
path_without_protocol: str = field(init=False)
|
|
51
|
-
supported_protocols: list[str] = field(
|
|
42
|
+
class FileConfig(BaseModel):
|
|
43
|
+
remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
|
|
44
|
+
protocol: str = Field(init=False)
|
|
45
|
+
path_without_protocol: str = Field(init=False)
|
|
46
|
+
supported_protocols: list[str] = Field(
|
|
47
|
+
init=False,
|
|
52
48
|
default_factory=lambda: [
|
|
53
49
|
"s3",
|
|
54
50
|
"s3a",
|
|
@@ -59,37 +55,27 @@ class FileConfig(Base):
|
|
|
59
55
|
"box",
|
|
60
56
|
"dropbox",
|
|
61
57
|
"sftp",
|
|
62
|
-
]
|
|
58
|
+
],
|
|
63
59
|
)
|
|
64
60
|
|
|
65
|
-
def
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
"Protocol {} not supported yet, only {} are supported.".format(
|
|
71
|
-
self.protocol, ", ".join(self.supported_protocols)
|
|
72
|
-
),
|
|
73
|
-
)
|
|
61
|
+
def __init__(self, **data):
|
|
62
|
+
protocol, path_without_protocol = data["remote_url"].split("://")
|
|
63
|
+
data["protocol"] = protocol
|
|
64
|
+
data["path_without_protocol"] = path_without_protocol
|
|
65
|
+
super().__init__(**data)
|
|
74
66
|
|
|
75
67
|
|
|
76
|
-
@dataclass
|
|
77
68
|
class FsspecIndexerConfig(FileConfig, IndexerConfig):
|
|
78
69
|
recursive: bool = False
|
|
79
70
|
|
|
80
71
|
|
|
81
|
-
@dataclass
|
|
82
72
|
class FsspecAccessConfig(AccessConfig):
|
|
83
73
|
pass
|
|
84
74
|
|
|
85
75
|
|
|
86
|
-
FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
@dataclass
|
|
90
76
|
class FsspecConnectionConfig(ConnectionConfig):
|
|
91
|
-
access_config:
|
|
92
|
-
connector_type: str = CONNECTOR_TYPE
|
|
77
|
+
access_config: Secret[FsspecAccessConfig]
|
|
78
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
93
79
|
|
|
94
80
|
|
|
95
81
|
FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
|
|
@@ -100,7 +86,7 @@ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnect
|
|
|
100
86
|
class FsspecIndexer(Indexer):
|
|
101
87
|
connection_config: FsspecConnectionConfigT
|
|
102
88
|
index_config: FsspecIndexerConfigT
|
|
103
|
-
connector_type: str = CONNECTOR_TYPE
|
|
89
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
104
90
|
|
|
105
91
|
@property
|
|
106
92
|
def fs(self) -> "AbstractFileSystem":
|
|
@@ -223,7 +209,6 @@ class FsspecIndexer(Indexer):
|
|
|
223
209
|
)
|
|
224
210
|
|
|
225
211
|
|
|
226
|
-
@dataclass
|
|
227
212
|
class FsspecDownloaderConfig(DownloaderConfig):
|
|
228
213
|
pass
|
|
229
214
|
|
|
@@ -274,9 +259,10 @@ class FsspecDownloader(Downloader):
|
|
|
274
259
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
275
260
|
|
|
276
261
|
|
|
277
|
-
@dataclass
|
|
278
262
|
class FsspecUploaderConfig(FileConfig, UploaderConfig):
|
|
279
|
-
overwrite: bool =
|
|
263
|
+
overwrite: bool = Field(
|
|
264
|
+
default=False, description="If true, an existing file will be overwritten."
|
|
265
|
+
)
|
|
280
266
|
|
|
281
267
|
|
|
282
268
|
FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional, Union
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
10
11
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
@@ -26,17 +27,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
27
|
CONNECTOR_TYPE = "gcs"
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@dataclass
|
|
30
30
|
class GcsIndexerConfig(FsspecIndexerConfig):
|
|
31
31
|
pass
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
|
|
34
|
+
service_account_key_description = """
|
|
35
|
+
Options:
|
|
36
|
+
- ``None``, GCSFS will attempt to guess your credentials in the
|
|
37
|
+
following order: gcloud CLI default, gcsfs cached token, google compute
|
|
38
|
+
metadata service, anonymous.
|
|
39
|
+
- ``'google_default'``, your default gcloud credentials will be used,
|
|
40
|
+
which are typically established by doing ``gcloud login`` in a terminal.
|
|
41
|
+
- ``'cache'``, credentials from previously successful gcsfs
|
|
42
|
+
authentication will be used (use this after "browser" auth succeeded)
|
|
43
|
+
- ``'anon'``, no authentication is performed, and you can only
|
|
44
|
+
access data which is accessible to allUsers (in this case, the project and
|
|
45
|
+
access level parameters are meaningless)
|
|
46
|
+
- ``'browser'``, you get an access code with which you can
|
|
47
|
+
authenticate via a specially provided URL
|
|
48
|
+
- if ``'cloud'``, we assume we are running within google compute
|
|
49
|
+
or google container engine, and query the internal metadata directly for
|
|
50
|
+
a token.
|
|
51
|
+
- you may supply a token generated by the
|
|
52
|
+
[gcloud](https://cloud.google.com/sdk/docs/)
|
|
53
|
+
utility; this is either a python dictionary or the name of a file
|
|
54
|
+
containing the JSON returned by logging in with the gcloud CLI tool.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
|
|
35
58
|
class GcsAccessConfig(FsspecAccessConfig):
|
|
36
|
-
service_account_key: Optional[str] =
|
|
37
|
-
|
|
59
|
+
service_account_key: Optional[str] = Field(
|
|
60
|
+
default=None, description=service_account_key_description
|
|
61
|
+
)
|
|
62
|
+
token: Union[str, dict, None] = Field(init=False, default=None)
|
|
38
63
|
|
|
39
|
-
def
|
|
64
|
+
def model_post_init(self, __context: Any) -> None:
|
|
40
65
|
ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"
|
|
41
66
|
|
|
42
67
|
# Case: null value
|
|
@@ -61,13 +86,15 @@ class GcsAccessConfig(FsspecAccessConfig):
|
|
|
61
86
|
raise ValueError("Invalid auth token value")
|
|
62
87
|
|
|
63
88
|
|
|
64
|
-
|
|
89
|
+
SecretGcsAccessConfig = Secret[GcsAccessConfig]
|
|
90
|
+
|
|
91
|
+
|
|
65
92
|
class GcsConnectionConfig(FsspecConnectionConfig):
|
|
66
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"])
|
|
67
|
-
access_config:
|
|
68
|
-
|
|
93
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
|
|
94
|
+
access_config: SecretGcsAccessConfig = Field(
|
|
95
|
+
default_factory=lambda: SecretGcsAccessConfig(secret_value=GcsAccessConfig())
|
|
69
96
|
)
|
|
70
|
-
connector_type: str = CONNECTOR_TYPE
|
|
97
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
71
98
|
|
|
72
99
|
|
|
73
100
|
@dataclass
|
|
@@ -85,7 +112,6 @@ class GcsIndexer(FsspecIndexer):
|
|
|
85
112
|
super().precheck()
|
|
86
113
|
|
|
87
114
|
|
|
88
|
-
@dataclass
|
|
89
115
|
class GcsDownloaderConfig(FsspecDownloaderConfig):
|
|
90
116
|
pass
|
|
91
117
|
|
|
@@ -106,7 +132,6 @@ class GcsDownloader(FsspecDownloader):
|
|
|
106
132
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
107
133
|
|
|
108
134
|
|
|
109
|
-
@dataclass
|
|
110
135
|
class GcsUploaderConfig(FsspecUploaderConfig):
|
|
111
136
|
pass
|
|
112
137
|
|
|
@@ -5,9 +5,9 @@ from pathlib import Path
|
|
|
5
5
|
from time import time
|
|
6
6
|
from typing import Any, Generator, Optional
|
|
7
7
|
|
|
8
|
-
from
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
9
|
|
|
10
|
-
from unstructured_ingest.
|
|
10
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
11
|
from unstructured_ingest.v2.interfaces import (
|
|
12
12
|
DownloadResponse,
|
|
13
13
|
FileData,
|
|
@@ -32,27 +32,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
32
32
|
CONNECTOR_TYPE = "s3"
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
@dataclass
|
|
36
35
|
class S3IndexerConfig(FsspecIndexerConfig):
|
|
37
36
|
pass
|
|
38
37
|
|
|
39
38
|
|
|
40
|
-
@dataclass
|
|
41
39
|
class S3AccessConfig(FsspecAccessConfig):
|
|
42
|
-
key: Optional[str] =
|
|
43
|
-
|
|
44
|
-
|
|
40
|
+
key: Optional[str] = Field(
|
|
41
|
+
default=None,
|
|
42
|
+
description="If not anonymous, use this access key ID, if specified. Takes precedence "
|
|
43
|
+
"over `aws_access_key_id` in client_kwargs.",
|
|
44
|
+
)
|
|
45
|
+
secret: Optional[str] = Field(
|
|
46
|
+
default=None, description="If not anonymous, use this secret access key, if specified."
|
|
47
|
+
)
|
|
48
|
+
token: Optional[str] = Field(
|
|
49
|
+
default=None, description="If not anonymous, use this security token, if specified."
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
SecretS3AccessConfig = Secret[S3AccessConfig]
|
|
45
54
|
|
|
46
55
|
|
|
47
|
-
@dataclass
|
|
48
56
|
class S3ConnectionConfig(FsspecConnectionConfig):
|
|
49
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"])
|
|
50
|
-
access_config:
|
|
51
|
-
|
|
57
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
|
|
58
|
+
access_config: SecretS3AccessConfig = Field(
|
|
59
|
+
default_factory=lambda: SecretS3AccessConfig(secret_value=S3AccessConfig())
|
|
52
60
|
)
|
|
53
|
-
endpoint_url: Optional[str] =
|
|
54
|
-
|
|
55
|
-
|
|
61
|
+
endpoint_url: Optional[str] = Field(
|
|
62
|
+
default=None,
|
|
63
|
+
description="Use this endpoint_url, if specified. Needed for "
|
|
64
|
+
"connecting to non-AWS S3 buckets.",
|
|
65
|
+
)
|
|
66
|
+
anonymous: bool = Field(
|
|
67
|
+
default=False, description="Connect to s3 without local AWS credentials."
|
|
68
|
+
)
|
|
69
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
56
70
|
|
|
57
71
|
def get_access_config(self) -> dict[str, Any]:
|
|
58
72
|
access_configs: dict[str, Any] = {"anon": self.anonymous}
|
|
@@ -60,7 +74,9 @@ class S3ConnectionConfig(FsspecConnectionConfig):
|
|
|
60
74
|
access_configs["endpoint_url"] = self.endpoint_url
|
|
61
75
|
|
|
62
76
|
# Avoid injecting None by filtering out k,v pairs where the value is None
|
|
63
|
-
access_configs.update(
|
|
77
|
+
access_configs.update(
|
|
78
|
+
{k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
|
|
79
|
+
)
|
|
64
80
|
return access_configs
|
|
65
81
|
|
|
66
82
|
|
|
@@ -116,7 +132,6 @@ class S3Indexer(FsspecIndexer):
|
|
|
116
132
|
super().precheck()
|
|
117
133
|
|
|
118
134
|
|
|
119
|
-
@dataclass
|
|
120
135
|
class S3DownloaderConfig(FsspecDownloaderConfig):
|
|
121
136
|
pass
|
|
122
137
|
|
|
@@ -137,7 +152,6 @@ class S3Downloader(FsspecDownloader):
|
|
|
137
152
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
138
153
|
|
|
139
154
|
|
|
140
|
-
@dataclass
|
|
141
155
|
class S3UploaderConfig(FsspecUploaderConfig):
|
|
142
156
|
pass
|
|
143
157
|
|
|
@@ -6,7 +6,8 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Any, Generator, Optional
|
|
7
7
|
from urllib.parse import urlparse
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
10
11
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
12
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
12
13
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -27,10 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
27
28
|
CONNECTOR_TYPE = "sftp"
|
|
28
29
|
|
|
29
30
|
|
|
30
|
-
@dataclass
|
|
31
31
|
class SftpIndexerConfig(FsspecIndexerConfig):
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
|
|
33
|
+
def model_post_init(self, __context: Any) -> None:
|
|
34
|
+
super().model_post_init(__context)
|
|
34
35
|
_, ext = os.path.splitext(self.remote_url)
|
|
35
36
|
parsed_url = urlparse(self.remote_url)
|
|
36
37
|
if ext:
|
|
@@ -39,21 +40,21 @@ class SftpIndexerConfig(FsspecIndexerConfig):
|
|
|
39
40
|
self.path_without_protocol = parsed_url.path.lstrip("/")
|
|
40
41
|
|
|
41
42
|
|
|
42
|
-
@dataclass
|
|
43
43
|
class SftpAccessConfig(FsspecAccessConfig):
|
|
44
|
-
password: str
|
|
44
|
+
password: str = Field(description="Password for sftp connection")
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
@dataclass
|
|
48
47
|
class SftpConnectionConfig(FsspecConnectionConfig):
|
|
49
|
-
supported_protocols: list[str] =
|
|
50
|
-
access_config: SftpAccessConfig
|
|
51
|
-
connector_type: str = CONNECTOR_TYPE
|
|
52
|
-
username:
|
|
53
|
-
host: Optional[str] = None
|
|
54
|
-
port: int = 22
|
|
55
|
-
look_for_keys: bool =
|
|
56
|
-
|
|
48
|
+
supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
|
|
49
|
+
access_config: Secret[SftpAccessConfig]
|
|
50
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
51
|
+
username: str = Field(description="Username for sftp connection")
|
|
52
|
+
host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
|
|
53
|
+
port: int = Field(default=22, description="Port for sftp connection")
|
|
54
|
+
look_for_keys: bool = Field(
|
|
55
|
+
default=False, description="Whether to search for private key files in ~/.ssh/"
|
|
56
|
+
)
|
|
57
|
+
allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")
|
|
57
58
|
|
|
58
59
|
def get_access_config(self) -> dict[str, Any]:
|
|
59
60
|
access_config = {
|
|
@@ -62,7 +63,7 @@ class SftpConnectionConfig(FsspecConnectionConfig):
|
|
|
62
63
|
"port": self.port,
|
|
63
64
|
"look_for_keys": self.look_for_keys,
|
|
64
65
|
"allow_agent": self.allow_agent,
|
|
65
|
-
"password": self.access_config.password,
|
|
66
|
+
"password": self.access_config.get_secret_value().password,
|
|
66
67
|
}
|
|
67
68
|
return access_config
|
|
68
69
|
|
|
@@ -96,24 +97,15 @@ class SftpIndexer(FsspecIndexer):
|
|
|
96
97
|
super().precheck()
|
|
97
98
|
|
|
98
99
|
|
|
99
|
-
@dataclass
|
|
100
100
|
class SftpDownloaderConfig(FsspecDownloaderConfig):
|
|
101
|
-
remote_url:
|
|
102
|
-
|
|
103
|
-
def __post_init__(self):
|
|
104
|
-
# TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
|
|
105
|
-
if not self.remote_url:
|
|
106
|
-
raise TypeError(
|
|
107
|
-
f"{self.__class__.__name__}.__init__() "
|
|
108
|
-
f"missing 1 required positional argument: 'remote_url'"
|
|
109
|
-
)
|
|
101
|
+
remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
|
|
110
102
|
|
|
111
103
|
|
|
112
104
|
@dataclass
|
|
113
105
|
class SftpDownloader(FsspecDownloader):
|
|
114
106
|
protocol: str = "sftp"
|
|
115
107
|
connection_config: SftpConnectionConfig
|
|
116
|
-
connector_type: str = CONNECTOR_TYPE
|
|
108
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
117
109
|
download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)
|
|
118
110
|
|
|
119
111
|
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
@@ -131,7 +123,6 @@ class SftpDownloader(FsspecDownloader):
|
|
|
131
123
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
132
124
|
|
|
133
125
|
|
|
134
|
-
@dataclass
|
|
135
126
|
class SftpUploaderConfig(FsspecUploaderConfig):
|
|
136
127
|
pass
|
|
137
128
|
|