unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.



Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0

--- a/unstructured_ingest/v2/processes/connectors/elasticsearch.py
+++ b/unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -5,9 +5,10 @@ import uuid
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
- from typing import TYPE_CHECKING, Any, Generator, Optional
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
+
+ from pydantic import BaseModel, Field, Secret, SecretStr

- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
  from unstructured_ingest.error import (
      DestinationConnectionError,
      SourceConnectionError,
@@ -44,57 +45,74 @@ if TYPE_CHECKING:
  CONNECTOR_TYPE = "elasticsearch"


- @dataclass
  class ElasticsearchAccessConfig(AccessConfig):
-     password: Optional[str] = None
-     api_key: Optional[str] = enhanced_field(default=None, overload_name="es_api_key")
-     bearer_auth: Optional[str] = None
-     ssl_assert_fingerprint: Optional[str] = None
-
-
- @dataclass
- class ElasticsearchClientInput(EnhancedDataClassJsonMixin):
+     password: Optional[str] = Field(
+         default=None, description="password when using basic auth or connecting to a cloud instance"
+     )
+     es_api_key: Optional[str] = Field(default=None, description="api key used for authentication")
+     bearer_auth: Optional[str] = Field(
+         default=None, description="bearer token used for HTTP bearer authentication"
+     )
+     ssl_assert_fingerprint: Optional[str] = Field(
+         default=None, description="SHA256 fingerprint value"
+     )
+
+
+ class ElasticsearchClientInput(BaseModel):
      hosts: Optional[list[str]] = None
      cloud_id: Optional[str] = None
-     ca_certs: Optional[str] = None
-     basic_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
-     api_key: Optional[str] = enhanced_field(sensitive=True, default=None)
+     ca_certs: Optional[Path] = None
+     basic_auth: Optional[Secret[tuple[str, str]]] = None
+     api_key: Optional[Union[Secret[tuple[str, str]], SecretStr]] = None


- @dataclass
  class ElasticsearchConnectionConfig(ConnectionConfig):
-     hosts: Optional[list[str]] = None
-     username: Optional[str] = None
-     cloud_id: Optional[str] = None
-     api_key_id: Optional[str] = None
-     ca_certs: Optional[str] = None
-     access_config: ElasticsearchAccessConfig = enhanced_field(sensitive=True)
+     hosts: Optional[list[str]] = Field(
+         default=None,
+         description="list of the Elasticsearch hosts to connect to",
+         examples=["http://localhost:9200"],
+     )
+     username: Optional[str] = Field(default=None, description="username when using basic auth")
+     cloud_id: Optional[str] = Field(default=None, description="id used to connect to Elastic Cloud")
+     api_key_id: Optional[str] = Field(
+         default=None,
+         description="id associated with api key used for authentication: "
+         "https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html",  # noqa: E501
+     )
+     ca_certs: Optional[Path] = None
+     access_config: Secret[ElasticsearchAccessConfig]

      def get_client_kwargs(self) -> dict:
          # Update auth related fields to conform to what the SDK expects based on the
          # supported methods:
          # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
-         client_input = ElasticsearchClientInput()
+         client_input_kwargs: dict[str, Any] = {}
+         access_config = self.access_config.get_secret_value()
          if self.hosts:
-             client_input.hosts = self.hosts
+             client_input_kwargs["hosts"] = self.hosts
          if self.cloud_id:
-             client_input.cloud_id = self.cloud_id
+             client_input_kwargs["cloud_id"] = self.cloud_id
          if self.ca_certs:
-             client_input.ca_certs = self.ca_certs
-         if self.access_config.password and (
-             self.cloud_id or self.ca_certs or self.access_config.ssl_assert_fingerprint
+             client_input_kwargs["ca_certs"] = self.ca_certs
+         if access_config.password and (
+             self.cloud_id or self.ca_certs or access_config.ssl_assert_fingerprint
          ):
-             client_input.basic_auth = ("elastic", self.access_config.password)
-         elif not self.cloud_id and self.username and self.access_config.password:
-             client_input.basic_auth = (self.username, self.access_config.password)
-         elif self.access_config.api_key and self.api_key_id:
-             client_input.api_key = (self.api_key_id, self.access_config.api_key)
-         elif self.access_config.api_key:
-             client_input.api_key = self.access_config.api_key
-         logger.debug(
-             f"Elasticsearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
+             client_input_kwargs["basic_auth"] = ("elastic", access_config.password)
+         elif not self.cloud_id and self.username and access_config.password:
+             client_input_kwargs["basic_auth"] = (self.username, access_config.password)
+         elif access_config.es_api_key and self.api_key_id:
+             client_input_kwargs["api_key"] = (self.api_key_id, access_config.es_api_key)
+         elif access_config.es_api_key:
+             client_input_kwargs["api_key"] = access_config.es_api_key
+         client_input = ElasticsearchClientInput(**client_input_kwargs)
+         logger.debug(f"Elasticsearch client inputs mapped to: {client_input.dict()}")
+         client_kwargs = client_input.dict()
+         client_kwargs["basic_auth"] = (
+             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
+         )
+         client_kwargs["api_key"] = (
+             client_input.api_key.get_secret_value() if client_input.api_key else None
          )
-         client_kwargs = client_input.to_dict(redact_sensitive=False)
          client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
          return client_kwargs

@@ -114,7 +132,6 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
              raise SourceConnectionError(f"failed to validate connection: {e}")


- @dataclass
  class ElasticsearchIndexerConfig(IndexerConfig):
      index_name: str
      batch_size: int = 100
@@ -186,7 +203,6 @@ class ElasticsearchIndexer(Indexer):
          )


- @dataclass
  class ElasticsearchDownloaderConfig(DownloaderConfig):
      fields: list[str] = field(default_factory=list)

@@ -292,9 +308,10 @@ class ElasticsearchDownloader(Downloader):
          return download_responses


- @dataclass
  class ElasticsearchUploadStagerConfig(UploadStagerConfig):
-     index_name: str
+     index_name: str = Field(
+         description="Name of the Elasticsearch index to pull data from, or upload data to."
+     )


  @dataclass
@@ -333,11 +350,19 @@ class ElasticsearchUploadStager(UploadStager):
          return output_path


- @dataclass
  class ElasticsearchUploaderConfig(UploaderConfig):
-     index_name: str
-     batch_size_bytes: int = 15_000_000
-     num_threads: int = 4
+     index_name: str = Field(
+         description="Name of the Elasticsearch index to pull data from, or upload data to."
+     )
+     batch_size_bytes: int = Field(
+         default=15_000_000,
+         description="Size limit (in bytes) for each batch of items to be uploaded. Check"
+         " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html"
+         "#_how_big_is_too_big for more information.",
+     )
+     num_threads: int = Field(
+         default=4, description="Number of threads to be used while uploading content"
+     )


  @dataclass
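
As a usage sketch of what this migration to pydantic means in practice (illustrative host and credential values; assumes pydantic >= 2.7, where the generic Secret type used above is available):

    from unstructured_ingest.v2.processes.connectors.elasticsearch import (
        ElasticsearchAccessConfig,
        ElasticsearchConnectionConfig,
    )

    # pydantic coerces the plain access config into Secret[...] during
    # validation, so the password stays masked in repr() and serialized output.
    conn = ElasticsearchConnectionConfig(
        hosts=["http://localhost:9200"],
        username="ingest",
        access_config=ElasticsearchAccessConfig(password="hunter2"),
    )

    # With no cloud_id set, username + password collapse into the SDK's
    # basic_auth tuple and None-valued kwargs are filtered out, so this
    # should yield:
    # {"hosts": ["http://localhost:9200"], "basic_auth": ("ingest", "hunter2")}
    print(conn.get_client_kwargs())
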

--- a/unstructured_ingest/v2/processes/connectors/fsspec/azure.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/azure.py
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, Generator, Optional

- from unstructured_ingest.enhanced_dataclass import enhanced_field
+ from pydantic import Field, Secret
+
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -36,35 +37,59 @@ def azure_json_serial(obj):
      return json_serial(obj)


- @dataclass
  class AzureIndexerConfig(FsspecIndexerConfig):
      pass


- @dataclass
  class AzureAccessConfig(FsspecAccessConfig):
-     account_name: Optional[str] = None
-     account_key: Optional[str] = None
-     connection_string: Optional[str] = None
-     sas_token: Optional[str] = None
+     account_name: Optional[str] = Field(
+         default=None,
+         description="The storage account name. This is used to authenticate "
+         "requests signed with an account key and to construct "
+         "the storage endpoint. It is required unless a connection "
+         "string is given, or if a custom domain is used with "
+         "anonymous authentication.",
+     )
+     account_key: Optional[str] = Field(
+         default=None,
+         description="The storage account key. This is used for shared key "
+         "authentication. If any of account key, sas token or "
+         "client_id are not specified, anonymous access will be used.",
+     )
+     connection_string: Optional[str] = Field(
+         default=None,
+         description="If specified, this will override all other parameters. See "
+         "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ "  # noqa: E501
+         "for the connection string format.",
+     )
+     sas_token: Optional[str] = Field(
+         default=None,
+         description="A shared access signature token to use to authenticate "
+         "requests instead of the account key. If account key and "
+         "sas token are both specified, account key will be used "
+         "to sign. If any of account key, sas token or client_id "
+         "are not specified, anonymous access will be used.",
+     )

-     def __post_init__(self):
+     def model_post_init(self, __context: Any) -> None:
          if self.connection_string is None and self.account_name is None:
              raise ValueError("either connection_string or account_name must be set")


- @dataclass
+ SecretAzureAccessConfig = Secret[AzureAccessConfig]
+
+
  class AzureConnectionConfig(FsspecConnectionConfig):
-     supported_protocols: list[str] = field(default_factory=lambda: ["az"])
-     access_config: AzureAccessConfig = enhanced_field(
-         sensitive=True, default_factory=lambda: AzureAccessConfig()
+     supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
+     access_config: SecretAzureAccessConfig = Field(
+         default_factory=lambda: SecretAzureAccessConfig(secret_value=AzureAccessConfig())
      )
-     connector_type: str = CONNECTOR_TYPE
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

      def get_access_config(self) -> dict[str, Any]:
          # Avoid injecting None by filtering out k,v pairs where the value is None
          access_configs: dict[str, Any] = {
-             k: v for k, v in self.access_config.to_dict().items() if v
+             k: v for k, v in self.access_config.get_secret_value().dict().items() if v
          }
          return access_configs
@@ -88,7 +113,6 @@ class AzureIndexer(FsspecIndexer):
          return super().run(**kwargs)


- @dataclass
  class AzureDownloaderConfig(FsspecDownloaderConfig):
      pass

@@ -109,7 +133,6 @@ class AzureDownloader(FsspecDownloader):
          return await super().run_async(file_data=file_data, **kwargs)


- @dataclass
  class AzureUploaderConfig(FsspecUploaderConfig):
      pass

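
The same Secret-wrapping pattern repeats in the fsspec connectors below: the access config is stored behind pydantic's Secret so it serializes masked, and call sites unwrap it explicitly. A minimal sketch of that behavior, assuming pydantic >= 2.7 and made-up account values:

    from unstructured_ingest.v2.processes.connectors.fsspec.azure import (
        AzureAccessConfig,
        AzureConnectionConfig,
    )

    # model_post_init rejects configs with neither connection_string nor account_name.
    access = AzureAccessConfig(account_name="demo", account_key="k3y")
    conn = AzureConnectionConfig(access_config=access)  # wrapped into Secret[...] on validation

    print(conn.model_dump_json())    # access_config renders masked: "**********"
    print(conn.get_access_config())  # {'account_name': 'demo', 'account_key': 'k3y'}
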

--- a/unstructured_ingest/v2/processes/connectors/fsspec/box.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, Generator, Optional

- from unstructured_ingest.enhanced_dataclass import enhanced_field
+ from pydantic import Field, Secret
+
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -25,35 +26,38 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
  CONNECTOR_TYPE = "box"


- @dataclass
  class BoxIndexerConfig(FsspecIndexerConfig):
      pass


- @dataclass
  class BoxAccessConfig(FsspecAccessConfig):
-     box_app_config: Optional[str] = None
+     box_app_config: Optional[str] = Field(
+         default=None, description="Path to Box app credentials as json file."
+     )
+
+
+ SecretBoxAccessConfig = Secret[BoxAccessConfig]


- @dataclass
  class BoxConnectionConfig(FsspecConnectionConfig):
-     supported_protocols: list[str] = field(default_factory=lambda: ["box"])
-     access_config: BoxAccessConfig = enhanced_field(
-         sensitive=True, default_factory=lambda: BoxAccessConfig()
+     supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
+     access_config: SecretBoxAccessConfig = Field(
+         default_factory=lambda: SecretBoxAccessConfig(secret_value=BoxAccessConfig())
      )
-     connector_type: str = CONNECTOR_TYPE
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

      def get_access_config(self) -> dict[str, Any]:
          # Return access_kwargs with oauth. The oauth object can not be stored directly in the config
          # because it is not serializable.
          from boxsdk import JWTAuth

+         ac = self.access_config.get_secret_value()
          access_kwargs_with_oauth: dict[str, Any] = {
              "oauth": JWTAuth.from_settings_file(
-                 self.access_config.box_app_config,
+                 ac.box_app_config,
              ),
          }
-         access_config: dict[str, Any] = self.access_config.to_dict()
+         access_config: dict[str, Any] = ac.dict()
          access_config.pop("box_app_config", None)
          access_kwargs_with_oauth.update(access_config)
@@ -75,7 +79,6 @@ class BoxIndexer(FsspecIndexer):
          super().precheck()


- @dataclass
  class BoxDownloaderConfig(FsspecDownloaderConfig):
      pass

@@ -96,7 +99,6 @@ class BoxDownloader(FsspecDownloader):
          return await super().run_async(file_data=file_data, **kwargs)


- @dataclass
  class BoxUploaderConfig(FsspecUploaderConfig):
      pass

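
Box is the one connector here where unwrapping the secret is not enough: the JWT auth object is not serializable, so get_access_config rebuilds it from the stored settings path on every call. A sketch (requires the boxsdk dependency; the settings path is hypothetical):

    from unstructured_ingest.v2.processes.connectors.fsspec.box import (
        BoxAccessConfig,
        BoxConnectionConfig,
    )

    conn = BoxConnectionConfig(
        access_config=BoxAccessConfig(box_app_config="/secrets/box_app_settings.json")
    )
    # Reads the JWT app settings file at call time and returns {"oauth": <JWTAuth>}
    # for fsspec, popping box_app_config itself out of the kwargs.
    fs_kwargs = conn.get_access_config()
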

--- a/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, Generator, Optional

- from unstructured_ingest.enhanced_dataclass import enhanced_field
+ from pydantic import Field, Secret
+
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -26,23 +27,23 @@ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_d
  CONNECTOR_TYPE = "dropbox"


- @dataclass
  class DropboxIndexerConfig(FsspecIndexerConfig):
      pass


- @dataclass
  class DropboxAccessConfig(FsspecAccessConfig):
-     token: Optional[str] = None
+     token: Optional[str] = Field(default=None, description="Dropbox access token.")
+
+
+ SecretDropboxAccessConfig = Secret[DropboxAccessConfig]


- @dataclass
  class DropboxConnectionConfig(FsspecConnectionConfig):
-     supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
-     access_config: DropboxAccessConfig = enhanced_field(
-         sensitive=True, default_factory=lambda: DropboxAccessConfig()
+     supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
+     access_config: SecretDropboxAccessConfig = Field(
+         default_factory=lambda: SecretDropboxAccessConfig(secret_value=DropboxAccessConfig())
      )
-     connector_type: str = CONNECTOR_TYPE
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)


  @dataclass
@@ -72,7 +73,6 @@ class DropboxIndexer(FsspecIndexer):
          return sterilize_dict(data=info)


- @dataclass
  class DropboxDownloaderConfig(FsspecDownloaderConfig):
      pass

@@ -95,7 +95,6 @@ class DropboxDownloader(FsspecDownloader):
          return await super().run_async(file_data=file_data, **kwargs)


- @dataclass
  class DropboxUploaderConfig(FsspecUploaderConfig):
      pass


--- a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -8,7 +8,8 @@ from time import time
  from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
  from uuid import NAMESPACE_DNS, uuid5

- from unstructured_ingest.enhanced_dataclass import enhanced_field
+ from pydantic import BaseModel, Field, Secret
+
  from unstructured_ingest.error import (
      DestinationConnectionError,
      SourceConnectionError,
@@ -38,17 +39,12 @@ if TYPE_CHECKING:
  CONNECTOR_TYPE = "fsspec"


- class Base(object):
-     def __post_init__(self):
-         pass
-
-
- @dataclass
- class FileConfig(Base):
-     remote_url: str
-     protocol: str = field(init=False)
-     path_without_protocol: str = field(init=False)
-     supported_protocols: list[str] = field(
+ class FileConfig(BaseModel):
+     remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
+     protocol: str = Field(init=False)
+     path_without_protocol: str = Field(init=False)
+     supported_protocols: list[str] = Field(
+         init=False,
          default_factory=lambda: [
              "s3",
              "s3a",
@@ -59,37 +55,27 @@ class FileConfig(Base):
              "box",
              "dropbox",
              "sftp",
-         ]
+         ],
      )

-     def __post_init__(self):
-         super().__post_init__()
-         self.protocol, self.path_without_protocol = self.remote_url.split("://")
-         if self.protocol not in self.supported_protocols:
-             raise ValueError(
-                 "Protocol {} not supported yet, only {} are supported.".format(
-                     self.protocol, ", ".join(self.supported_protocols)
-                 ),
-             )
+     def __init__(self, **data):
+         protocol, path_without_protocol = data["remote_url"].split("://")
+         data["protocol"] = protocol
+         data["path_without_protocol"] = path_without_protocol
+         super().__init__(**data)


- @dataclass
  class FsspecIndexerConfig(FileConfig, IndexerConfig):
      recursive: bool = False


- @dataclass
  class FsspecAccessConfig(AccessConfig):
      pass


- FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
-
-
- @dataclass
  class FsspecConnectionConfig(ConnectionConfig):
-     access_config: FsspecAccessConfigT = enhanced_field(sensitive=True, default=None)
-     connector_type: str = CONNECTOR_TYPE
+     access_config: Secret[FsspecAccessConfig]
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)


  FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
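
FileConfig now derives protocol and path_without_protocol eagerly in __init__ rather than in a __post_init__ hook; note that the old "Protocol ... not supported" ValueError is no longer raised at this point. A sketch of the resulting behavior (bucket path made up):

    from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import FsspecIndexerConfig

    cfg = FsspecIndexerConfig(remote_url="s3://my-bucket/docs")
    print(cfg.protocol)               # "s3"
    print(cfg.path_without_protocol)  # "my-bucket/docs"
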
@@ -100,7 +86,7 @@ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnect
  class FsspecIndexer(Indexer):
      connection_config: FsspecConnectionConfigT
      index_config: FsspecIndexerConfigT
-     connector_type: str = CONNECTOR_TYPE
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

      @property
      def fs(self) -> "AbstractFileSystem":
@@ -223,7 +209,6 @@ class FsspecIndexer(Indexer):
          )


- @dataclass
  class FsspecDownloaderConfig(DownloaderConfig):
      pass

@@ -274,9 +259,10 @@ class FsspecDownloader(Downloader):
          return self.generate_download_response(file_data=file_data, download_path=download_path)


- @dataclass
  class FsspecUploaderConfig(FileConfig, UploaderConfig):
-     overwrite: bool = False
+     overwrite: bool = Field(
+         default=False, description="If true, an existing file will be overwritten."
+     )


  FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)

--- a/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, Generator, Optional, Union

- from unstructured_ingest.enhanced_dataclass import enhanced_field
+ from pydantic import Field, Secret
+
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.utils.string_and_date_utils import json_to_dict
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
@@ -26,17 +27,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
  CONNECTOR_TYPE = "gcs"


- @dataclass
  class GcsIndexerConfig(FsspecIndexerConfig):
      pass


- @dataclass
+ service_account_key_description = """
+ Options:
+ - ``None``, GCSFS will attempt to guess your credentials in the
+ following order: gcloud CLI default, gcsfs cached token, google compute
+ metadata service, anonymous.
+ - ``'google_default'``, your default gcloud credentials will be used,
+ which are typically established by doing ``gcloud login`` in a terminal.
+ - ``'cache'``, credentials from previously successful gcsfs
+ authentication will be used (use this after "browser" auth succeeded)
+ - ``'anon'``, no authentication is performed, and you can only
+ access data which is accessible to allUsers (in this case, the project and
+ access level parameters are meaningless)
+ - ``'browser'``, you get an access code with which you can
+ authenticate via a specially provided URL
+ - if ``'cloud'``, we assume we are running within google compute
+ or google container engine, and query the internal metadata directly for
+ a token.
+ - you may supply a token generated by the
+ [gcloud](https://cloud.google.com/sdk/docs/)
+ utility; this is either a python dictionary or the name of a file
+ containing the JSON returned by logging in with the gcloud CLI tool.
+ """
+
+
  class GcsAccessConfig(FsspecAccessConfig):
-     service_account_key: Optional[str] = None
-     token: Union[str, dict, None] = field(init=False, default=None)
+     service_account_key: Optional[str] = Field(
+         default=None, description=service_account_key_description
+     )
+     token: Union[str, dict, None] = Field(init=False, default=None)

-     def __post_init__(self):
+     def model_post_init(self, __context: Any) -> None:
          ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"

          # Case: null value
@@ -61,13 +86,15 @@ class GcsAccessConfig(FsspecAccessConfig):
          raise ValueError("Invalid auth token value")


- @dataclass
+ SecretGcsAccessConfig = Secret[GcsAccessConfig]
+
+
  class GcsConnectionConfig(FsspecConnectionConfig):
-     supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"])
-     access_config: GcsAccessConfig = enhanced_field(
-         sensitive=True, default_factory=lambda: GcsAccessConfig()
+     supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
+     access_config: SecretGcsAccessConfig = Field(
+         default_factory=lambda: SecretGcsAccessConfig(secret_value=GcsAccessConfig())
      )
-     connector_type: str = CONNECTOR_TYPE
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)


  @dataclass
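
Per the description string and the validation tail above, service_account_key accepts the named gcsfs auth modes, a key file, or inline JSON, and model_post_init rejects anything else. A sketch of the expected behavior (the invalid value is deliberately bogus; its exact failure path runs through logic elided from this diff):

    from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsAccessConfig

    GcsAccessConfig(service_account_key="google_default")  # named gcsfs auth mode
    GcsAccessConfig(service_account_key=None)              # let GCSFS guess credentials

    try:
        GcsAccessConfig(service_account_key="not-a-mode-or-a-key")
    except ValueError as err:
        print(err)  # expected: "Invalid auth token value"
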
@@ -85,7 +112,6 @@ class GcsIndexer(FsspecIndexer):
          super().precheck()


- @dataclass
  class GcsDownloaderConfig(FsspecDownloaderConfig):
      pass

@@ -106,7 +132,6 @@ class GcsDownloader(FsspecDownloader):
          return await super().run_async(file_data=file_data, **kwargs)


- @dataclass
  class GcsUploaderConfig(FsspecUploaderConfig):
      pass