unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest has been flagged as potentially problematic; see the package registry's advisory page for details.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
11
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -36,35 +37,59 @@ def azure_json_serial(obj):
36
37
  return json_serial(obj)
37
38
 
38
39
 
39
- @dataclass
40
40
  class AzureIndexerConfig(FsspecIndexerConfig):
41
41
  pass
42
42
 
43
43
 
44
- @dataclass
45
44
  class AzureAccessConfig(FsspecAccessConfig):
46
- account_name: Optional[str] = None
47
- account_key: Optional[str] = None
48
- connection_string: Optional[str] = None
49
- sas_token: Optional[str] = None
45
+ account_name: Optional[str] = Field(
46
+ default=None,
47
+ description="The storage account name. This is used to authenticate "
48
+ "requests signed with an account key and to construct "
49
+ "the storage endpoint. It is required unless a connection "
50
+ "string is given, or if a custom domain is used with "
51
+ "anonymous authentication.",
52
+ )
53
+ account_key: Optional[str] = Field(
54
+ default=None,
55
+ description="The storage account key. This is used for shared key "
56
+ "authentication. If any of account key, sas token or "
57
+ "client_id are not specified, anonymous access will be used.",
58
+ )
59
+ connection_string: Optional[str] = Field(
60
+ default=None,
61
+ description="If specified, this will override all other parameters. See "
62
+ "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501
63
+ "for the connection string format.",
64
+ )
65
+ sas_token: Optional[str] = Field(
66
+ default=None,
67
+ description="A shared access signature token to use to authenticate "
68
+ "requests instead of the account key. If account key and "
69
+ "sas token are both specified, account key will be used "
70
+ "to sign. If any of account key, sas token or client_id "
71
+ "are not specified, anonymous access will be used.",
72
+ )
50
73
 
51
- def __post_init__(self):
74
+ def model_post_init(self, __context: Any) -> None:
52
75
  if self.connection_string is None and self.account_name is None:
53
76
  raise ValueError("either connection_string or account_name must be set")
54
77
 
55
78
 
56
- @dataclass
79
+ SecretAzureAccessConfig = Secret[AzureAccessConfig]
80
+
81
+
57
82
  class AzureConnectionConfig(FsspecConnectionConfig):
58
- supported_protocols: list[str] = field(default_factory=lambda: ["az"])
59
- access_config: AzureAccessConfig = enhanced_field(
60
- sensitive=True, default_factory=lambda: AzureAccessConfig()
83
+ supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
84
+ access_config: SecretAzureAccessConfig = Field(
85
+ default_factory=lambda: SecretAzureAccessConfig(secret_value=AzureAccessConfig())
61
86
  )
62
- connector_type: str = CONNECTOR_TYPE
87
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
63
88
 
64
89
  def get_access_config(self) -> dict[str, Any]:
65
90
  # Avoid injecting None by filtering out k,v pairs where the value is None
66
91
  access_configs: dict[str, Any] = {
67
- k: v for k, v in self.access_config.to_dict().items() if v
92
+ k: v for k, v in self.access_config.get_secret_value().dict().items() if v
68
93
  }
69
94
  return access_configs
70
95
 
@@ -88,7 +113,6 @@ class AzureIndexer(FsspecIndexer):
88
113
  return super().run(**kwargs)
89
114
 
90
115
 
91
- @dataclass
92
116
  class AzureDownloaderConfig(FsspecDownloaderConfig):
93
117
  pass
94
118
 
@@ -109,7 +133,6 @@ class AzureDownloader(FsspecDownloader):
109
133
  return await super().run_async(file_data=file_data, **kwargs)
110
134
 
111
135
 
112
- @dataclass
113
136
  class AzureUploaderConfig(FsspecUploaderConfig):
114
137
  pass
115
138
 
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
11
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -25,35 +26,38 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
25
26
  CONNECTOR_TYPE = "box"
26
27
 
27
28
 
28
- @dataclass
29
29
  class BoxIndexerConfig(FsspecIndexerConfig):
30
30
  pass
31
31
 
32
32
 
33
- @dataclass
34
33
  class BoxAccessConfig(FsspecAccessConfig):
35
- box_app_config: Optional[str] = None
34
+ box_app_config: Optional[str] = Field(
35
+ default=None, description="Path to Box app credentials as json file."
36
+ )
37
+
38
+
39
+ SecretBoxAccessConfig = Secret[BoxAccessConfig]
36
40
 
37
41
 
38
- @dataclass
39
42
  class BoxConnectionConfig(FsspecConnectionConfig):
40
- supported_protocols: list[str] = field(default_factory=lambda: ["box"])
41
- access_config: BoxAccessConfig = enhanced_field(
42
- sensitive=True, default_factory=lambda: BoxAccessConfig()
43
+ supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
44
+ access_config: SecretBoxAccessConfig = Field(
45
+ default_factory=lambda: SecretBoxAccessConfig(secret_value=BoxAccessConfig())
43
46
  )
44
- connector_type: str = CONNECTOR_TYPE
47
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
45
48
 
46
49
  def get_access_config(self) -> dict[str, Any]:
47
50
  # Return access_kwargs with oauth. The oauth object can not be stored directly in the config
48
51
  # because it is not serializable.
49
52
  from boxsdk import JWTAuth
50
53
 
54
+ ac = self.access_config.get_secret_value()
51
55
  access_kwargs_with_oauth: dict[str, Any] = {
52
56
  "oauth": JWTAuth.from_settings_file(
53
- self.access_config.box_app_config,
57
+ ac.box_app_config,
54
58
  ),
55
59
  }
56
- access_config: dict[str, Any] = self.access_config.to_dict()
60
+ access_config: dict[str, Any] = ac.dict()
57
61
  access_config.pop("box_app_config", None)
58
62
  access_kwargs_with_oauth.update(access_config)
59
63
 
@@ -75,7 +79,6 @@ class BoxIndexer(FsspecIndexer):
75
79
  super().precheck()
76
80
 
77
81
 
78
- @dataclass
79
82
  class BoxDownloaderConfig(FsspecDownloaderConfig):
80
83
  pass
81
84
 
@@ -96,7 +99,6 @@ class BoxDownloader(FsspecDownloader):
96
99
  return await super().run_async(file_data=file_data, **kwargs)
97
100
 
98
101
 
99
- @dataclass
100
102
  class BoxUploaderConfig(FsspecUploaderConfig):
101
103
  pass
102
104
 
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
11
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -26,23 +27,23 @@ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_d
26
27
  CONNECTOR_TYPE = "dropbox"
27
28
 
28
29
 
29
- @dataclass
30
30
  class DropboxIndexerConfig(FsspecIndexerConfig):
31
31
  pass
32
32
 
33
33
 
34
- @dataclass
35
34
  class DropboxAccessConfig(FsspecAccessConfig):
36
- token: Optional[str] = None
35
+ token: Optional[str] = Field(default=None, description="Dropbox access token.")
36
+
37
+
38
+ SecretDropboxAccessConfig = Secret[DropboxAccessConfig]
37
39
 
38
40
 
39
- @dataclass
40
41
  class DropboxConnectionConfig(FsspecConnectionConfig):
41
- supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
42
- access_config: DropboxAccessConfig = enhanced_field(
43
- sensitive=True, default_factory=lambda: DropboxAccessConfig()
42
+ supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
43
+ access_config: SecretDropboxAccessConfig = Field(
44
+ default_factory=lambda: SecretDropboxAccessConfig(secret_value=DropboxAccessConfig())
44
45
  )
45
- connector_type: str = CONNECTOR_TYPE
46
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
46
47
 
47
48
 
48
49
  @dataclass
@@ -72,7 +73,6 @@ class DropboxIndexer(FsspecIndexer):
72
73
  return sterilize_dict(data=info)
73
74
 
74
75
 
75
- @dataclass
76
76
  class DropboxDownloaderConfig(FsspecDownloaderConfig):
77
77
  pass
78
78
 
@@ -95,7 +95,6 @@ class DropboxDownloader(FsspecDownloader):
95
95
  return await super().run_async(file_data=file_data, **kwargs)
96
96
 
97
97
 
98
- @dataclass
99
98
  class DropboxUploaderConfig(FsspecUploaderConfig):
100
99
  pass
101
100
 
@@ -8,7 +8,8 @@ from time import time
8
8
  from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
9
9
  from uuid import NAMESPACE_DNS, uuid5
10
10
 
11
- from unstructured_ingest.enhanced_dataclass import enhanced_field
11
+ from pydantic import BaseModel, Field, Secret
12
+
12
13
  from unstructured_ingest.error import (
13
14
  DestinationConnectionError,
14
15
  SourceConnectionError,
@@ -38,17 +39,12 @@ if TYPE_CHECKING:
38
39
  CONNECTOR_TYPE = "fsspec"
39
40
 
40
41
 
41
- class Base(object):
42
- def __post_init__(self):
43
- pass
44
-
45
-
46
- @dataclass
47
- class FileConfig(Base):
48
- remote_url: str
49
- protocol: str = field(init=False)
50
- path_without_protocol: str = field(init=False)
51
- supported_protocols: list[str] = field(
42
+ class FileConfig(BaseModel):
43
+ remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
44
+ protocol: str = Field(init=False)
45
+ path_without_protocol: str = Field(init=False)
46
+ supported_protocols: list[str] = Field(
47
+ init=False,
52
48
  default_factory=lambda: [
53
49
  "s3",
54
50
  "s3a",
@@ -59,37 +55,27 @@ class FileConfig(Base):
59
55
  "box",
60
56
  "dropbox",
61
57
  "sftp",
62
- ]
58
+ ],
63
59
  )
64
60
 
65
- def __post_init__(self):
66
- super().__post_init__()
67
- self.protocol, self.path_without_protocol = self.remote_url.split("://")
68
- if self.protocol not in self.supported_protocols:
69
- raise ValueError(
70
- "Protocol {} not supported yet, only {} are supported.".format(
71
- self.protocol, ", ".join(self.supported_protocols)
72
- ),
73
- )
61
+ def __init__(self, **data):
62
+ protocol, path_without_protocol = data["remote_url"].split("://")
63
+ data["protocol"] = protocol
64
+ data["path_without_protocol"] = path_without_protocol
65
+ super().__init__(**data)
74
66
 
75
67
 
76
- @dataclass
77
68
  class FsspecIndexerConfig(FileConfig, IndexerConfig):
78
69
  recursive: bool = False
79
70
 
80
71
 
81
- @dataclass
82
72
  class FsspecAccessConfig(AccessConfig):
83
73
  pass
84
74
 
85
75
 
86
- FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
87
-
88
-
89
- @dataclass
90
76
  class FsspecConnectionConfig(ConnectionConfig):
91
- access_config: FsspecAccessConfigT = enhanced_field(sensitive=True, default=None)
92
- connector_type: str = CONNECTOR_TYPE
77
+ access_config: Secret[FsspecAccessConfig]
78
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
93
79
 
94
80
 
95
81
  FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
@@ -100,7 +86,7 @@ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnect
100
86
  class FsspecIndexer(Indexer):
101
87
  connection_config: FsspecConnectionConfigT
102
88
  index_config: FsspecIndexerConfigT
103
- connector_type: str = CONNECTOR_TYPE
89
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
104
90
 
105
91
  @property
106
92
  def fs(self) -> "AbstractFileSystem":
@@ -223,7 +209,6 @@ class FsspecIndexer(Indexer):
223
209
  )
224
210
 
225
211
 
226
- @dataclass
227
212
  class FsspecDownloaderConfig(DownloaderConfig):
228
213
  pass
229
214
 
@@ -274,9 +259,10 @@ class FsspecDownloader(Downloader):
274
259
  return self.generate_download_response(file_data=file_data, download_path=download_path)
275
260
 
276
261
 
277
- @dataclass
278
262
  class FsspecUploaderConfig(FileConfig, UploaderConfig):
279
- overwrite: bool = False
263
+ overwrite: bool = Field(
264
+ default=False, description="If true, an existing file will be overwritten."
265
+ )
280
266
 
281
267
 
282
268
  FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional, Union
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.utils.string_and_date_utils import json_to_dict
10
11
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
@@ -26,17 +27,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
27
  CONNECTOR_TYPE = "gcs"
27
28
 
28
29
 
29
- @dataclass
30
30
  class GcsIndexerConfig(FsspecIndexerConfig):
31
31
  pass
32
32
 
33
33
 
34
- @dataclass
34
+ service_account_key_description = """
35
+ Options:
36
+ - ``None``, GCSFS will attempt to guess your credentials in the
37
+ following order: gcloud CLI default, gcsfs cached token, google compute
38
+ metadata service, anonymous.
39
+ - ``'google_default'``, your default gcloud credentials will be used,
40
+ which are typically established by doing ``gcloud login`` in a terminal.
41
+ - ``'cache'``, credentials from previously successful gcsfs
42
+ authentication will be used (use this after "browser" auth succeeded)
43
+ - ``'anon'``, no authentication is performed, and you can only
44
+ access data which is accessible to allUsers (in this case, the project and
45
+ access level parameters are meaningless)
46
+ - ``'browser'``, you get an access code with which you can
47
+ authenticate via a specially provided URL
48
+ - if ``'cloud'``, we assume we are running within google compute
49
+ or google container engine, and query the internal metadata directly for
50
+ a token.
51
+ - you may supply a token generated by the
52
+ [gcloud](https://cloud.google.com/sdk/docs/)
53
+ utility; this is either a python dictionary or the name of a file
54
+ containing the JSON returned by logging in with the gcloud CLI tool.
55
+ """
56
+
57
+
35
58
  class GcsAccessConfig(FsspecAccessConfig):
36
- service_account_key: Optional[str] = None
37
- token: Union[str, dict, None] = field(init=False, default=None)
59
+ service_account_key: Optional[str] = Field(
60
+ default=None, description=service_account_key_description
61
+ )
62
+ token: Union[str, dict, None] = Field(init=False, default=None)
38
63
 
39
- def __post_init__(self):
64
+ def model_post_init(self, __context: Any) -> None:
40
65
  ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"
41
66
 
42
67
  # Case: null value
@@ -61,13 +86,15 @@ class GcsAccessConfig(FsspecAccessConfig):
61
86
  raise ValueError("Invalid auth token value")
62
87
 
63
88
 
64
- @dataclass
89
+ SecretGcsAccessConfig = Secret[GcsAccessConfig]
90
+
91
+
65
92
  class GcsConnectionConfig(FsspecConnectionConfig):
66
- supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"])
67
- access_config: GcsAccessConfig = enhanced_field(
68
- sensitive=True, default_factory=lambda: GcsAccessConfig()
93
+ supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
94
+ access_config: SecretGcsAccessConfig = Field(
95
+ default_factory=lambda: SecretGcsAccessConfig(secret_value=GcsAccessConfig())
69
96
  )
70
- connector_type: str = CONNECTOR_TYPE
97
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
71
98
 
72
99
 
73
100
  @dataclass
@@ -85,7 +112,6 @@ class GcsIndexer(FsspecIndexer):
85
112
  super().precheck()
86
113
 
87
114
 
88
- @dataclass
89
115
  class GcsDownloaderConfig(FsspecDownloaderConfig):
90
116
  pass
91
117
 
@@ -106,7 +132,6 @@ class GcsDownloader(FsspecDownloader):
106
132
  return await super().run_async(file_data=file_data, **kwargs)
107
133
 
108
134
 
109
- @dataclass
110
135
  class GcsUploaderConfig(FsspecUploaderConfig):
111
136
  pass
112
137
 
@@ -5,9 +5,9 @@ from pathlib import Path
5
5
  from time import time
6
6
  from typing import Any, Generator, Optional
7
7
 
8
- from unstructured.utils import requires_dependencies
8
+ from pydantic import Field, Secret
9
9
 
10
- from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
11
  from unstructured_ingest.v2.interfaces import (
12
12
  DownloadResponse,
13
13
  FileData,
@@ -32,27 +32,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
32
32
  CONNECTOR_TYPE = "s3"
33
33
 
34
34
 
35
- @dataclass
36
35
  class S3IndexerConfig(FsspecIndexerConfig):
37
36
  pass
38
37
 
39
38
 
40
- @dataclass
41
39
  class S3AccessConfig(FsspecAccessConfig):
42
- key: Optional[str] = None
43
- secret: Optional[str] = None
44
- token: Optional[str] = None
40
+ key: Optional[str] = Field(
41
+ default=None,
42
+ description="If not anonymous, use this access key ID, if specified. Takes precedence "
43
+ "over `aws_access_key_id` in client_kwargs.",
44
+ )
45
+ secret: Optional[str] = Field(
46
+ default=None, description="If not anonymous, use this secret access key, if specified."
47
+ )
48
+ token: Optional[str] = Field(
49
+ default=None, description="If not anonymous, use this security token, if specified."
50
+ )
51
+
52
+
53
+ SecretS3AccessConfig = Secret[S3AccessConfig]
45
54
 
46
55
 
47
- @dataclass
48
56
  class S3ConnectionConfig(FsspecConnectionConfig):
49
- supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"])
50
- access_config: S3AccessConfig = enhanced_field(
51
- sensitive=True, default_factory=lambda: S3AccessConfig()
57
+ supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
58
+ access_config: SecretS3AccessConfig = Field(
59
+ default_factory=lambda: SecretS3AccessConfig(secret_value=S3AccessConfig())
52
60
  )
53
- endpoint_url: Optional[str] = None
54
- anonymous: bool = False
55
- connector_type: str = CONNECTOR_TYPE
61
+ endpoint_url: Optional[str] = Field(
62
+ default=None,
63
+ description="Use this endpoint_url, if specified. Needed for "
64
+ "connecting to non-AWS S3 buckets.",
65
+ )
66
+ anonymous: bool = Field(
67
+ default=False, description="Connect to s3 without local AWS credentials."
68
+ )
69
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
56
70
 
57
71
  def get_access_config(self) -> dict[str, Any]:
58
72
  access_configs: dict[str, Any] = {"anon": self.anonymous}
@@ -60,7 +74,9 @@ class S3ConnectionConfig(FsspecConnectionConfig):
60
74
  access_configs["endpoint_url"] = self.endpoint_url
61
75
 
62
76
  # Avoid injecting None by filtering out k,v pairs where the value is None
63
- access_configs.update({k: v for k, v in self.access_config.to_dict().items() if v})
77
+ access_configs.update(
78
+ {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
79
+ )
64
80
  return access_configs
65
81
 
66
82
 
@@ -116,7 +132,6 @@ class S3Indexer(FsspecIndexer):
116
132
  super().precheck()
117
133
 
118
134
 
119
- @dataclass
120
135
  class S3DownloaderConfig(FsspecDownloaderConfig):
121
136
  pass
122
137
 
@@ -137,7 +152,6 @@ class S3Downloader(FsspecDownloader):
137
152
  return await super().run_async(file_data=file_data, **kwargs)
138
153
 
139
154
 
140
- @dataclass
141
155
  class S3UploaderConfig(FsspecUploaderConfig):
142
156
  pass
143
157
 
@@ -6,7 +6,8 @@ from pathlib import Path
6
6
  from typing import Any, Generator, Optional
7
7
  from urllib.parse import urlparse
8
8
 
9
- from unstructured_ingest.enhanced_dataclass import enhanced_field
9
+ from pydantic import Field, Secret
10
+
10
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
11
12
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
12
13
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -27,10 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
27
28
  CONNECTOR_TYPE = "sftp"
28
29
 
29
30
 
30
- @dataclass
31
31
  class SftpIndexerConfig(FsspecIndexerConfig):
32
- def __post_init__(self):
33
- super().__post_init__()
32
+
33
+ def model_post_init(self, __context: Any) -> None:
34
+ super().model_post_init(__context)
34
35
  _, ext = os.path.splitext(self.remote_url)
35
36
  parsed_url = urlparse(self.remote_url)
36
37
  if ext:
@@ -39,21 +40,21 @@ class SftpIndexerConfig(FsspecIndexerConfig):
39
40
  self.path_without_protocol = parsed_url.path.lstrip("/")
40
41
 
41
42
 
42
- @dataclass
43
43
  class SftpAccessConfig(FsspecAccessConfig):
44
- password: str
44
+ password: str = Field(description="Password for sftp connection")
45
45
 
46
46
 
47
- @dataclass
48
47
  class SftpConnectionConfig(FsspecConnectionConfig):
49
- supported_protocols: list[str] = field(default_factory=lambda: ["sftp"])
50
- access_config: SftpAccessConfig = enhanced_field(sensitive=True)
51
- connector_type: str = CONNECTOR_TYPE
52
- username: Optional[str] = None
53
- host: Optional[str] = None
54
- port: int = 22
55
- look_for_keys: bool = False
56
- allow_agent: bool = False
48
+ supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
49
+ access_config: Secret[SftpAccessConfig]
50
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
51
+ username: str = Field(description="Username for sftp connection")
52
+ host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
53
+ port: int = Field(default=22, description="Port for sftp connection")
54
+ look_for_keys: bool = Field(
55
+ default=False, description="Whether to search for private key files in ~/.ssh/"
56
+ )
57
+ allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")
57
58
 
58
59
  def get_access_config(self) -> dict[str, Any]:
59
60
  access_config = {
@@ -62,7 +63,7 @@ class SftpConnectionConfig(FsspecConnectionConfig):
62
63
  "port": self.port,
63
64
  "look_for_keys": self.look_for_keys,
64
65
  "allow_agent": self.allow_agent,
65
- "password": self.access_config.password,
66
+ "password": self.access_config.get_secret_value().password,
66
67
  }
67
68
  return access_config
68
69
 
@@ -96,24 +97,15 @@ class SftpIndexer(FsspecIndexer):
96
97
  super().precheck()
97
98
 
98
99
 
99
- @dataclass
100
100
  class SftpDownloaderConfig(FsspecDownloaderConfig):
101
- remote_url: Optional[str] = None
102
-
103
- def __post_init__(self):
104
- # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
105
- if not self.remote_url:
106
- raise TypeError(
107
- f"{self.__class__.__name__}.__init__() "
108
- f"missing 1 required positional argument: 'remote_url'"
109
- )
101
+ remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
110
102
 
111
103
 
112
104
  @dataclass
113
105
  class SftpDownloader(FsspecDownloader):
114
106
  protocol: str = "sftp"
115
107
  connection_config: SftpConnectionConfig
116
- connector_type: str = CONNECTOR_TYPE
108
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
117
109
  download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)
118
110
 
119
111
  @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
@@ -131,7 +123,6 @@ class SftpDownloader(FsspecDownloader):
131
123
  return await super().run_async(file_data=file_data, **kwargs)
132
124
 
133
125
 
134
- @dataclass
135
126
  class SftpUploaderConfig(FsspecUploaderConfig):
136
127
  pass
137
128