unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +1 -5
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/local.py +22 -14
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -5,9 +5,10 @@ import uuid
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from time import time
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Union
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field, Secret, SecretStr
|
|
9
11
|
|
|
10
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
|
11
12
|
from unstructured_ingest.error import (
|
|
12
13
|
DestinationConnectionError,
|
|
13
14
|
SourceConnectionError,
|
|
@@ -44,57 +45,74 @@ if TYPE_CHECKING:
|
|
|
44
45
|
CONNECTOR_TYPE = "elasticsearch"
|
|
45
46
|
|
|
46
47
|
|
|
47
|
-
@dataclass
|
|
48
48
|
class ElasticsearchAccessConfig(AccessConfig):
|
|
49
|
-
password: Optional[str] =
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
49
|
+
password: Optional[str] = Field(
|
|
50
|
+
default=None, description="password when using basic auth or connecting to a cloud instance"
|
|
51
|
+
)
|
|
52
|
+
es_api_key: Optional[str] = Field(default=None, description="api key used for authentication")
|
|
53
|
+
bearer_auth: Optional[str] = Field(
|
|
54
|
+
default=None, description="bearer token used for HTTP bearer authentication"
|
|
55
|
+
)
|
|
56
|
+
ssl_assert_fingerprint: Optional[str] = Field(
|
|
57
|
+
default=None, description="SHA256 fingerprint value"
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ElasticsearchClientInput(BaseModel):
|
|
57
62
|
hosts: Optional[list[str]] = None
|
|
58
63
|
cloud_id: Optional[str] = None
|
|
59
|
-
ca_certs: Optional[
|
|
60
|
-
basic_auth: Optional[tuple[str, str]] =
|
|
61
|
-
api_key: Optional[str]
|
|
64
|
+
ca_certs: Optional[Path] = None
|
|
65
|
+
basic_auth: Optional[Secret[tuple[str, str]]] = None
|
|
66
|
+
api_key: Optional[Union[Secret[tuple[str, str]], SecretStr]] = None
|
|
62
67
|
|
|
63
68
|
|
|
64
|
-
@dataclass
|
|
65
69
|
class ElasticsearchConnectionConfig(ConnectionConfig):
|
|
66
|
-
hosts: Optional[list[str]] =
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
hosts: Optional[list[str]] = Field(
|
|
71
|
+
default=None,
|
|
72
|
+
description="list of the Elasticsearch hosts to connect to",
|
|
73
|
+
examples=["http://localhost:9200"],
|
|
74
|
+
)
|
|
75
|
+
username: Optional[str] = Field(default=None, description="username when using basic auth")
|
|
76
|
+
cloud_id: Optional[str] = Field(default=None, description="id used to connect to Elastic Cloud")
|
|
77
|
+
api_key_id: Optional[str] = Field(
|
|
78
|
+
default=None,
|
|
79
|
+
description="id associated with api key used for authentication: "
|
|
80
|
+
"https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html", # noqa: E501
|
|
81
|
+
)
|
|
82
|
+
ca_certs: Optional[Path] = None
|
|
83
|
+
access_config: Secret[ElasticsearchAccessConfig]
|
|
72
84
|
|
|
73
85
|
def get_client_kwargs(self) -> dict:
|
|
74
86
|
# Update auth related fields to conform to what the SDK expects based on the
|
|
75
87
|
# supported methods:
|
|
76
88
|
# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
|
|
77
|
-
|
|
89
|
+
client_input_kwargs: dict[str, Any] = {}
|
|
90
|
+
access_config = self.access_config.get_secret_value()
|
|
78
91
|
if self.hosts:
|
|
79
|
-
|
|
92
|
+
client_input_kwargs["hosts"] = self.hosts
|
|
80
93
|
if self.cloud_id:
|
|
81
|
-
|
|
94
|
+
client_input_kwargs["cloud_id"] = self.cloud_id
|
|
82
95
|
if self.ca_certs:
|
|
83
|
-
|
|
84
|
-
if
|
|
85
|
-
self.cloud_id or self.ca_certs or
|
|
96
|
+
client_input_kwargs["ca_certs"] = self.ca_certs
|
|
97
|
+
if access_config.password and (
|
|
98
|
+
self.cloud_id or self.ca_certs or access_config.ssl_assert_fingerprint
|
|
86
99
|
):
|
|
87
|
-
|
|
88
|
-
elif not self.cloud_id and self.username and
|
|
89
|
-
|
|
90
|
-
elif
|
|
91
|
-
|
|
92
|
-
elif
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
100
|
+
client_input_kwargs["basic_auth"] = ("elastic", access_config.password)
|
|
101
|
+
elif not self.cloud_id and self.username and access_config.password:
|
|
102
|
+
client_input_kwargs["basic_auth"] = (self.username, access_config.password)
|
|
103
|
+
elif access_config.es_api_key and self.api_key_id:
|
|
104
|
+
client_input_kwargs["api_key"] = (self.api_key_id, access_config.es_api_key)
|
|
105
|
+
elif access_config.es_api_key:
|
|
106
|
+
client_input_kwargs["api_key"] = access_config.es_api_key
|
|
107
|
+
client_input = ElasticsearchClientInput(**client_input_kwargs)
|
|
108
|
+
logger.debug(f"Elasticsearch client inputs mapped to: {client_input.dict()}")
|
|
109
|
+
client_kwargs = client_input.dict()
|
|
110
|
+
client_kwargs["basic_auth"] = (
|
|
111
|
+
client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
|
|
112
|
+
)
|
|
113
|
+
client_kwargs["api_key"] = (
|
|
114
|
+
client_input.api_key.get_secret_value() if client_input.api_key else None
|
|
96
115
|
)
|
|
97
|
-
client_kwargs = client_input.to_dict(redact_sensitive=False)
|
|
98
116
|
client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
|
|
99
117
|
return client_kwargs
|
|
100
118
|
|
|
@@ -114,7 +132,6 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
|
|
|
114
132
|
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
115
133
|
|
|
116
134
|
|
|
117
|
-
@dataclass
|
|
118
135
|
class ElasticsearchIndexerConfig(IndexerConfig):
|
|
119
136
|
index_name: str
|
|
120
137
|
batch_size: int = 100
|
|
@@ -186,7 +203,6 @@ class ElasticsearchIndexer(Indexer):
|
|
|
186
203
|
)
|
|
187
204
|
|
|
188
205
|
|
|
189
|
-
@dataclass
|
|
190
206
|
class ElasticsearchDownloaderConfig(DownloaderConfig):
|
|
191
207
|
fields: list[str] = field(default_factory=list)
|
|
192
208
|
|
|
@@ -292,9 +308,10 @@ class ElasticsearchDownloader(Downloader):
|
|
|
292
308
|
return download_responses
|
|
293
309
|
|
|
294
310
|
|
|
295
|
-
@dataclass
|
|
296
311
|
class ElasticsearchUploadStagerConfig(UploadStagerConfig):
|
|
297
|
-
index_name: str
|
|
312
|
+
index_name: str = Field(
|
|
313
|
+
description="Name of the Elasticsearch index to pull data from, or upload data to."
|
|
314
|
+
)
|
|
298
315
|
|
|
299
316
|
|
|
300
317
|
@dataclass
|
|
@@ -333,11 +350,19 @@ class ElasticsearchUploadStager(UploadStager):
|
|
|
333
350
|
return output_path
|
|
334
351
|
|
|
335
352
|
|
|
336
|
-
@dataclass
|
|
337
353
|
class ElasticsearchUploaderConfig(UploaderConfig):
|
|
338
|
-
index_name: str
|
|
339
|
-
|
|
340
|
-
|
|
354
|
+
index_name: str = Field(
|
|
355
|
+
description="Name of the Elasticsearch index to pull data from, or upload data to."
|
|
356
|
+
)
|
|
357
|
+
batch_size_bytes: int = Field(
|
|
358
|
+
default=15_000_000,
|
|
359
|
+
description="Size limit (in bytes) for each batch of items to be uploaded. Check"
|
|
360
|
+
" https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html"
|
|
361
|
+
"#_how_big_is_too_big for more information.",
|
|
362
|
+
)
|
|
363
|
+
num_threads: int = Field(
|
|
364
|
+
default=4, description="Number of threads to be used while uploading content"
|
|
365
|
+
)
|
|
341
366
|
|
|
342
367
|
|
|
343
368
|
@dataclass
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -36,35 +37,59 @@ def azure_json_serial(obj):
|
|
|
36
37
|
return json_serial(obj)
|
|
37
38
|
|
|
38
39
|
|
|
39
|
-
@dataclass
|
|
40
40
|
class AzureIndexerConfig(FsspecIndexerConfig):
|
|
41
41
|
pass
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
@dataclass
|
|
45
44
|
class AzureAccessConfig(FsspecAccessConfig):
|
|
46
|
-
account_name: Optional[str] =
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
45
|
+
account_name: Optional[str] = Field(
|
|
46
|
+
default=None,
|
|
47
|
+
description="The storage account name. This is used to authenticate "
|
|
48
|
+
"requests signed with an account key and to construct "
|
|
49
|
+
"the storage endpoint. It is required unless a connection "
|
|
50
|
+
"string is given, or if a custom domain is used with "
|
|
51
|
+
"anonymous authentication.",
|
|
52
|
+
)
|
|
53
|
+
account_key: Optional[str] = Field(
|
|
54
|
+
default=None,
|
|
55
|
+
description="The storage account key. This is used for shared key "
|
|
56
|
+
"authentication. If any of account key, sas token or "
|
|
57
|
+
"client_id are not specified, anonymous access will be used.",
|
|
58
|
+
)
|
|
59
|
+
connection_string: Optional[str] = Field(
|
|
60
|
+
default=None,
|
|
61
|
+
description="If specified, this will override all other parameters. See "
|
|
62
|
+
"http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501
|
|
63
|
+
"for the connection string format.",
|
|
64
|
+
)
|
|
65
|
+
sas_token: Optional[str] = Field(
|
|
66
|
+
default=None,
|
|
67
|
+
description="A shared access signature token to use to authenticate "
|
|
68
|
+
"requests instead of the account key. If account key and "
|
|
69
|
+
"sas token are both specified, account key will be used "
|
|
70
|
+
"to sign. If any of account key, sas token or client_id "
|
|
71
|
+
"are not specified, anonymous access will be used.",
|
|
72
|
+
)
|
|
50
73
|
|
|
51
|
-
def
|
|
74
|
+
def model_post_init(self, __context: Any) -> None:
|
|
52
75
|
if self.connection_string is None and self.account_name is None:
|
|
53
76
|
raise ValueError("either connection_string or account_name must be set")
|
|
54
77
|
|
|
55
78
|
|
|
56
|
-
|
|
79
|
+
SecretAzureAccessConfig = Secret[AzureAccessConfig]
|
|
80
|
+
|
|
81
|
+
|
|
57
82
|
class AzureConnectionConfig(FsspecConnectionConfig):
|
|
58
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["az"])
|
|
59
|
-
access_config:
|
|
60
|
-
|
|
83
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
|
|
84
|
+
access_config: SecretAzureAccessConfig = Field(
|
|
85
|
+
default_factory=lambda: SecretAzureAccessConfig(secret_value=AzureAccessConfig())
|
|
61
86
|
)
|
|
62
|
-
connector_type: str = CONNECTOR_TYPE
|
|
87
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
63
88
|
|
|
64
89
|
def get_access_config(self) -> dict[str, Any]:
|
|
65
90
|
# Avoid injecting None by filtering out k,v pairs where the value is None
|
|
66
91
|
access_configs: dict[str, Any] = {
|
|
67
|
-
k: v for k, v in self.access_config.
|
|
92
|
+
k: v for k, v in self.access_config.get_secret_value().dict().items() if v
|
|
68
93
|
}
|
|
69
94
|
return access_configs
|
|
70
95
|
|
|
@@ -88,7 +113,6 @@ class AzureIndexer(FsspecIndexer):
|
|
|
88
113
|
return super().run(**kwargs)
|
|
89
114
|
|
|
90
115
|
|
|
91
|
-
@dataclass
|
|
92
116
|
class AzureDownloaderConfig(FsspecDownloaderConfig):
|
|
93
117
|
pass
|
|
94
118
|
|
|
@@ -109,7 +133,6 @@ class AzureDownloader(FsspecDownloader):
|
|
|
109
133
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
110
134
|
|
|
111
135
|
|
|
112
|
-
@dataclass
|
|
113
136
|
class AzureUploaderConfig(FsspecUploaderConfig):
|
|
114
137
|
pass
|
|
115
138
|
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -25,35 +26,38 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
25
26
|
CONNECTOR_TYPE = "box"
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
@dataclass
|
|
29
29
|
class BoxIndexerConfig(FsspecIndexerConfig):
|
|
30
30
|
pass
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
@dataclass
|
|
34
33
|
class BoxAccessConfig(FsspecAccessConfig):
|
|
35
|
-
box_app_config: Optional[str] =
|
|
34
|
+
box_app_config: Optional[str] = Field(
|
|
35
|
+
default=None, description="Path to Box app credentials as json file."
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
SecretBoxAccessConfig = Secret[BoxAccessConfig]
|
|
36
40
|
|
|
37
41
|
|
|
38
|
-
@dataclass
|
|
39
42
|
class BoxConnectionConfig(FsspecConnectionConfig):
|
|
40
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["box"])
|
|
41
|
-
access_config:
|
|
42
|
-
|
|
43
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
|
|
44
|
+
access_config: SecretBoxAccessConfig = Field(
|
|
45
|
+
default_factory=lambda: SecretBoxAccessConfig(secret_value=BoxAccessConfig())
|
|
43
46
|
)
|
|
44
|
-
connector_type: str = CONNECTOR_TYPE
|
|
47
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
45
48
|
|
|
46
49
|
def get_access_config(self) -> dict[str, Any]:
|
|
47
50
|
# Return access_kwargs with oauth. The oauth object can not be stored directly in the config
|
|
48
51
|
# because it is not serializable.
|
|
49
52
|
from boxsdk import JWTAuth
|
|
50
53
|
|
|
54
|
+
ac = self.access_config.get_secret_value()
|
|
51
55
|
access_kwargs_with_oauth: dict[str, Any] = {
|
|
52
56
|
"oauth": JWTAuth.from_settings_file(
|
|
53
|
-
|
|
57
|
+
ac.box_app_config,
|
|
54
58
|
),
|
|
55
59
|
}
|
|
56
|
-
access_config: dict[str, Any] =
|
|
60
|
+
access_config: dict[str, Any] = ac.dict()
|
|
57
61
|
access_config.pop("box_app_config", None)
|
|
58
62
|
access_kwargs_with_oauth.update(access_config)
|
|
59
63
|
|
|
@@ -75,7 +79,6 @@ class BoxIndexer(FsspecIndexer):
|
|
|
75
79
|
super().precheck()
|
|
76
80
|
|
|
77
81
|
|
|
78
|
-
@dataclass
|
|
79
82
|
class BoxDownloaderConfig(FsspecDownloaderConfig):
|
|
80
83
|
pass
|
|
81
84
|
|
|
@@ -96,7 +99,6 @@ class BoxDownloader(FsspecDownloader):
|
|
|
96
99
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
97
100
|
|
|
98
101
|
|
|
99
|
-
@dataclass
|
|
100
102
|
class BoxUploaderConfig(FsspecUploaderConfig):
|
|
101
103
|
pass
|
|
102
104
|
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
10
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -26,23 +27,23 @@ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_d
|
|
|
26
27
|
CONNECTOR_TYPE = "dropbox"
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@dataclass
|
|
30
30
|
class DropboxIndexerConfig(FsspecIndexerConfig):
|
|
31
31
|
pass
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
@dataclass
|
|
35
34
|
class DropboxAccessConfig(FsspecAccessConfig):
|
|
36
|
-
token: Optional[str] = None
|
|
35
|
+
token: Optional[str] = Field(default=None, description="Dropbox access token.")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
SecretDropboxAccessConfig = Secret[DropboxAccessConfig]
|
|
37
39
|
|
|
38
40
|
|
|
39
|
-
@dataclass
|
|
40
41
|
class DropboxConnectionConfig(FsspecConnectionConfig):
|
|
41
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
|
|
42
|
-
access_config:
|
|
43
|
-
|
|
42
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
|
|
43
|
+
access_config: SecretDropboxAccessConfig = Field(
|
|
44
|
+
default_factory=lambda: SecretDropboxAccessConfig(secret_value=DropboxAccessConfig())
|
|
44
45
|
)
|
|
45
|
-
connector_type: str = CONNECTOR_TYPE
|
|
46
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
46
47
|
|
|
47
48
|
|
|
48
49
|
@dataclass
|
|
@@ -72,7 +73,6 @@ class DropboxIndexer(FsspecIndexer):
|
|
|
72
73
|
return sterilize_dict(data=info)
|
|
73
74
|
|
|
74
75
|
|
|
75
|
-
@dataclass
|
|
76
76
|
class DropboxDownloaderConfig(FsspecDownloaderConfig):
|
|
77
77
|
pass
|
|
78
78
|
|
|
@@ -95,7 +95,6 @@ class DropboxDownloader(FsspecDownloader):
|
|
|
95
95
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
@dataclass
|
|
99
98
|
class DropboxUploaderConfig(FsspecUploaderConfig):
|
|
100
99
|
pass
|
|
101
100
|
|
|
@@ -8,7 +8,8 @@ from time import time
|
|
|
8
8
|
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
|
|
9
9
|
from uuid import NAMESPACE_DNS, uuid5
|
|
10
10
|
|
|
11
|
-
from
|
|
11
|
+
from pydantic import BaseModel, Field, Secret
|
|
12
|
+
|
|
12
13
|
from unstructured_ingest.error import (
|
|
13
14
|
DestinationConnectionError,
|
|
14
15
|
SourceConnectionError,
|
|
@@ -38,17 +39,12 @@ if TYPE_CHECKING:
|
|
|
38
39
|
CONNECTOR_TYPE = "fsspec"
|
|
39
40
|
|
|
40
41
|
|
|
41
|
-
class
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class FileConfig(Base):
|
|
48
|
-
remote_url: str
|
|
49
|
-
protocol: str = field(init=False)
|
|
50
|
-
path_without_protocol: str = field(init=False)
|
|
51
|
-
supported_protocols: list[str] = field(
|
|
42
|
+
class FileConfig(BaseModel):
|
|
43
|
+
remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
|
|
44
|
+
protocol: str = Field(init=False)
|
|
45
|
+
path_without_protocol: str = Field(init=False)
|
|
46
|
+
supported_protocols: list[str] = Field(
|
|
47
|
+
init=False,
|
|
52
48
|
default_factory=lambda: [
|
|
53
49
|
"s3",
|
|
54
50
|
"s3a",
|
|
@@ -59,37 +55,27 @@ class FileConfig(Base):
|
|
|
59
55
|
"box",
|
|
60
56
|
"dropbox",
|
|
61
57
|
"sftp",
|
|
62
|
-
]
|
|
58
|
+
],
|
|
63
59
|
)
|
|
64
60
|
|
|
65
|
-
def
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
"Protocol {} not supported yet, only {} are supported.".format(
|
|
71
|
-
self.protocol, ", ".join(self.supported_protocols)
|
|
72
|
-
),
|
|
73
|
-
)
|
|
61
|
+
def __init__(self, **data):
|
|
62
|
+
protocol, path_without_protocol = data["remote_url"].split("://")
|
|
63
|
+
data["protocol"] = protocol
|
|
64
|
+
data["path_without_protocol"] = path_without_protocol
|
|
65
|
+
super().__init__(**data)
|
|
74
66
|
|
|
75
67
|
|
|
76
|
-
@dataclass
|
|
77
68
|
class FsspecIndexerConfig(FileConfig, IndexerConfig):
|
|
78
69
|
recursive: bool = False
|
|
79
70
|
|
|
80
71
|
|
|
81
|
-
@dataclass
|
|
82
72
|
class FsspecAccessConfig(AccessConfig):
|
|
83
73
|
pass
|
|
84
74
|
|
|
85
75
|
|
|
86
|
-
FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
@dataclass
|
|
90
76
|
class FsspecConnectionConfig(ConnectionConfig):
|
|
91
|
-
access_config:
|
|
92
|
-
connector_type: str = CONNECTOR_TYPE
|
|
77
|
+
access_config: Secret[FsspecAccessConfig]
|
|
78
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
93
79
|
|
|
94
80
|
|
|
95
81
|
FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
|
|
@@ -100,7 +86,7 @@ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnect
|
|
|
100
86
|
class FsspecIndexer(Indexer):
|
|
101
87
|
connection_config: FsspecConnectionConfigT
|
|
102
88
|
index_config: FsspecIndexerConfigT
|
|
103
|
-
connector_type: str = CONNECTOR_TYPE
|
|
89
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
104
90
|
|
|
105
91
|
@property
|
|
106
92
|
def fs(self) -> "AbstractFileSystem":
|
|
@@ -223,7 +209,6 @@ class FsspecIndexer(Indexer):
|
|
|
223
209
|
)
|
|
224
210
|
|
|
225
211
|
|
|
226
|
-
@dataclass
|
|
227
212
|
class FsspecDownloaderConfig(DownloaderConfig):
|
|
228
213
|
pass
|
|
229
214
|
|
|
@@ -274,9 +259,10 @@ class FsspecDownloader(Downloader):
|
|
|
274
259
|
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
|
275
260
|
|
|
276
261
|
|
|
277
|
-
@dataclass
|
|
278
262
|
class FsspecUploaderConfig(FileConfig, UploaderConfig):
|
|
279
|
-
overwrite: bool =
|
|
263
|
+
overwrite: bool = Field(
|
|
264
|
+
default=False, description="If true, an existing file will be overwritten."
|
|
265
|
+
)
|
|
280
266
|
|
|
281
267
|
|
|
282
268
|
FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
|
|
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any, Generator, Optional, Union
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
8
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
10
|
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
10
11
|
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
|
|
@@ -26,17 +27,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
27
|
CONNECTOR_TYPE = "gcs"
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@dataclass
|
|
30
30
|
class GcsIndexerConfig(FsspecIndexerConfig):
|
|
31
31
|
pass
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
|
|
34
|
+
service_account_key_description = """
|
|
35
|
+
Options:
|
|
36
|
+
- ``None``, GCSFS will attempt to guess your credentials in the
|
|
37
|
+
following order: gcloud CLI default, gcsfs cached token, google compute
|
|
38
|
+
metadata service, anonymous.
|
|
39
|
+
- ``'google_default'``, your default gcloud credentials will be used,
|
|
40
|
+
which are typically established by doing ``gcloud login`` in a terminal.
|
|
41
|
+
- ``'cache'``, credentials from previously successful gcsfs
|
|
42
|
+
authentication will be used (use this after "browser" auth succeeded)
|
|
43
|
+
- ``'anon'``, no authentication is performed, and you can only
|
|
44
|
+
access data which is accessible to allUsers (in this case, the project and
|
|
45
|
+
access level parameters are meaningless)
|
|
46
|
+
- ``'browser'``, you get an access code with which you can
|
|
47
|
+
authenticate via a specially provided URL
|
|
48
|
+
- if ``'cloud'``, we assume we are running within google compute
|
|
49
|
+
or google container engine, and query the internal metadata directly for
|
|
50
|
+
a token.
|
|
51
|
+
- you may supply a token generated by the
|
|
52
|
+
[gcloud](https://cloud.google.com/sdk/docs/)
|
|
53
|
+
utility; this is either a python dictionary or the name of a file
|
|
54
|
+
containing the JSON returned by logging in with the gcloud CLI tool.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
|
|
35
58
|
class GcsAccessConfig(FsspecAccessConfig):
|
|
36
|
-
service_account_key: Optional[str] =
|
|
37
|
-
|
|
59
|
+
service_account_key: Optional[str] = Field(
|
|
60
|
+
default=None, description=service_account_key_description
|
|
61
|
+
)
|
|
62
|
+
token: Union[str, dict, None] = Field(init=False, default=None)
|
|
38
63
|
|
|
39
|
-
def
|
|
64
|
+
def model_post_init(self, __context: Any) -> None:
|
|
40
65
|
ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"
|
|
41
66
|
|
|
42
67
|
# Case: null value
|
|
@@ -61,13 +86,15 @@ class GcsAccessConfig(FsspecAccessConfig):
|
|
|
61
86
|
raise ValueError("Invalid auth token value")
|
|
62
87
|
|
|
63
88
|
|
|
64
|
-
|
|
89
|
+
SecretGcsAccessConfig = Secret[GcsAccessConfig]
|
|
90
|
+
|
|
91
|
+
|
|
65
92
|
class GcsConnectionConfig(FsspecConnectionConfig):
|
|
66
|
-
supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"])
|
|
67
|
-
access_config:
|
|
68
|
-
|
|
93
|
+
supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
|
|
94
|
+
access_config: SecretGcsAccessConfig = Field(
|
|
95
|
+
default_factory=lambda: SecretGcsAccessConfig(secret_value=GcsAccessConfig())
|
|
69
96
|
)
|
|
70
|
-
connector_type: str = CONNECTOR_TYPE
|
|
97
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
71
98
|
|
|
72
99
|
|
|
73
100
|
@dataclass
|
|
@@ -85,7 +112,6 @@ class GcsIndexer(FsspecIndexer):
|
|
|
85
112
|
super().precheck()
|
|
86
113
|
|
|
87
114
|
|
|
88
|
-
@dataclass
|
|
89
115
|
class GcsDownloaderConfig(FsspecDownloaderConfig):
|
|
90
116
|
pass
|
|
91
117
|
|
|
@@ -106,7 +132,6 @@ class GcsDownloader(FsspecDownloader):
|
|
|
106
132
|
return await super().run_async(file_data=file_data, **kwargs)
|
|
107
133
|
|
|
108
134
|
|
|
109
|
-
@dataclass
|
|
110
135
|
class GcsUploaderConfig(FsspecUploaderConfig):
|
|
111
136
|
pass
|
|
112
137
|
|