unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
11
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -36,35 +37,59 @@ def azure_json_serial(obj):
36
37
  return json_serial(obj)
37
38
 
38
39
 
39
- @dataclass
40
40
  class AzureIndexerConfig(FsspecIndexerConfig):
41
41
  pass
42
42
 
43
43
 
44
- @dataclass
45
44
  class AzureAccessConfig(FsspecAccessConfig):
46
- account_name: Optional[str] = None
47
- account_key: Optional[str] = None
48
- connection_string: Optional[str] = None
49
- sas_token: Optional[str] = None
45
+ account_name: Optional[str] = Field(
46
+ default=None,
47
+ description="The storage account name. This is used to authenticate "
48
+ "requests signed with an account key and to construct "
49
+ "the storage endpoint. It is required unless a connection "
50
+ "string is given, or if a custom domain is used with "
51
+ "anonymous authentication.",
52
+ )
53
+ account_key: Optional[str] = Field(
54
+ default=None,
55
+ description="The storage account key. This is used for shared key "
56
+ "authentication. If any of account key, sas token or "
57
+ "client_id are not specified, anonymous access will be used.",
58
+ )
59
+ connection_string: Optional[str] = Field(
60
+ default=None,
61
+ description="If specified, this will override all other parameters. See "
62
+ "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501
63
+ "for the connection string format.",
64
+ )
65
+ sas_token: Optional[str] = Field(
66
+ default=None,
67
+ description="A shared access signature token to use to authenticate "
68
+ "requests instead of the account key. If account key and "
69
+ "sas token are both specified, account key will be used "
70
+ "to sign. If any of account key, sas token or client_id "
71
+ "are not specified, anonymous access will be used.",
72
+ )
50
73
 
51
- def __post_init__(self):
74
+ def model_post_init(self, __context: Any) -> None:
52
75
  if self.connection_string is None and self.account_name is None:
53
76
  raise ValueError("either connection_string or account_name must be set")
54
77
 
55
78
 
56
- @dataclass
79
+ SecretAzureAccessConfig = Secret[AzureAccessConfig]
80
+
81
+
57
82
  class AzureConnectionConfig(FsspecConnectionConfig):
58
- supported_protocols: list[str] = field(default_factory=lambda: ["az"])
59
- access_config: AzureAccessConfig = enhanced_field(
60
- sensitive=True, default_factory=lambda: AzureAccessConfig()
83
+ supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
84
+ access_config: SecretAzureAccessConfig = Field(
85
+ default_factory=lambda: SecretAzureAccessConfig(secret_value=AzureAccessConfig())
61
86
  )
62
- connector_type: str = CONNECTOR_TYPE
87
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
63
88
 
64
89
  def get_access_config(self) -> dict[str, Any]:
65
90
  # Avoid injecting None by filtering out k,v pairs where the value is None
66
91
  access_configs: dict[str, Any] = {
67
- k: v for k, v in self.access_config.to_dict().items() if v
92
+ k: v for k, v in self.access_config.get_secret_value().dict().items() if v
68
93
  }
69
94
  return access_configs
70
95
 
@@ -75,6 +100,10 @@ class AzureIndexer(FsspecIndexer):
75
100
  index_config: AzureIndexerConfig
76
101
  connector_type: str = CONNECTOR_TYPE
77
102
 
103
+ @requires_dependencies(["adlfs", "fsspec"], extras="azure")
104
+ def precheck(self) -> None:
105
+ super().precheck()
106
+
78
107
  def sterilize_info(self, path) -> dict:
79
108
  info = self.fs.info(path=path)
80
109
  return sterilize_dict(data=info, default=azure_json_serial)
@@ -84,7 +113,6 @@ class AzureIndexer(FsspecIndexer):
84
113
  return super().run(**kwargs)
85
114
 
86
115
 
87
- @dataclass
88
116
  class AzureDownloaderConfig(FsspecDownloaderConfig):
89
117
  pass
90
118
 
@@ -105,7 +133,6 @@ class AzureDownloader(FsspecDownloader):
105
133
  return await super().run_async(file_data=file_data, **kwargs)
106
134
 
107
135
 
108
- @dataclass
109
136
  class AzureUploaderConfig(FsspecUploaderConfig):
110
137
  pass
111
138
 
@@ -120,6 +147,10 @@ class AzureUploader(FsspecUploader):
120
147
  def __post_init__(self):
121
148
  super().__post_init__()
122
149
 
150
+ @requires_dependencies(["adlfs", "fsspec"], extras="azure")
151
+ def precheck(self) -> None:
152
+ super().precheck()
153
+
123
154
  @requires_dependencies(["adlfs", "fsspec"], extras="azure")
124
155
  def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
125
156
  return super().run(contents=contents, **kwargs)
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
11
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -25,35 +26,38 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
25
26
  CONNECTOR_TYPE = "box"
26
27
 
27
28
 
28
- @dataclass
29
29
  class BoxIndexerConfig(FsspecIndexerConfig):
30
30
  pass
31
31
 
32
32
 
33
- @dataclass
34
33
  class BoxAccessConfig(FsspecAccessConfig):
35
- box_app_config: Optional[str] = None
34
+ box_app_config: Optional[str] = Field(
35
+ default=None, description="Path to Box app credentials as json file."
36
+ )
37
+
38
+
39
+ SecretBoxAccessConfig = Secret[BoxAccessConfig]
36
40
 
37
41
 
38
- @dataclass
39
42
  class BoxConnectionConfig(FsspecConnectionConfig):
40
- supported_protocols: list[str] = field(default_factory=lambda: ["box"])
41
- access_config: BoxAccessConfig = enhanced_field(
42
- sensitive=True, default_factory=lambda: BoxAccessConfig()
43
+ supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
44
+ access_config: SecretBoxAccessConfig = Field(
45
+ default_factory=lambda: SecretBoxAccessConfig(secret_value=BoxAccessConfig())
43
46
  )
44
- connector_type: str = CONNECTOR_TYPE
47
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
45
48
 
46
49
  def get_access_config(self) -> dict[str, Any]:
47
50
  # Return access_kwargs with oauth. The oauth object can not be stored directly in the config
48
51
  # because it is not serializable.
49
52
  from boxsdk import JWTAuth
50
53
 
54
+ ac = self.access_config.get_secret_value()
51
55
  access_kwargs_with_oauth: dict[str, Any] = {
52
56
  "oauth": JWTAuth.from_settings_file(
53
- self.access_config.box_app_config,
57
+ ac.box_app_config,
54
58
  ),
55
59
  }
56
- access_config: dict[str, Any] = self.access_config.to_dict()
60
+ access_config: dict[str, Any] = ac.dict()
57
61
  access_config.pop("box_app_config", None)
58
62
  access_kwargs_with_oauth.update(access_config)
59
63
 
@@ -70,8 +74,11 @@ class BoxIndexer(FsspecIndexer):
70
74
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
71
75
  return super().run(**kwargs)
72
76
 
77
+ @requires_dependencies(["boxfs"], extras="box")
78
+ def precheck(self) -> None:
79
+ super().precheck()
80
+
73
81
 
74
- @dataclass
75
82
  class BoxDownloaderConfig(FsspecDownloaderConfig):
76
83
  pass
77
84
 
@@ -92,7 +99,6 @@ class BoxDownloader(FsspecDownloader):
92
99
  return await super().run_async(file_data=file_data, **kwargs)
93
100
 
94
101
 
95
- @dataclass
96
102
  class BoxUploaderConfig(FsspecUploaderConfig):
97
103
  pass
98
104
 
@@ -107,6 +113,10 @@ class BoxUploader(FsspecUploader):
107
113
  def __post_init__(self):
108
114
  super().__post_init__()
109
115
 
116
+ @requires_dependencies(["boxfs"], extras="box")
117
+ def precheck(self) -> None:
118
+ super().precheck()
119
+
110
120
  @requires_dependencies(["boxfs"], extras="box")
111
121
  def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
112
122
  return super().run(contents=contents, **kwargs)
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
11
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -26,23 +27,23 @@ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_d
26
27
  CONNECTOR_TYPE = "dropbox"
27
28
 
28
29
 
29
- @dataclass
30
30
  class DropboxIndexerConfig(FsspecIndexerConfig):
31
31
  pass
32
32
 
33
33
 
34
- @dataclass
35
34
  class DropboxAccessConfig(FsspecAccessConfig):
36
- token: Optional[str] = None
35
+ token: Optional[str] = Field(default=None, description="Dropbox access token.")
36
+
37
+
38
+ SecretDropboxAccessConfig = Secret[DropboxAccessConfig]
37
39
 
38
40
 
39
- @dataclass
40
41
  class DropboxConnectionConfig(FsspecConnectionConfig):
41
- supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
42
- access_config: DropboxAccessConfig = enhanced_field(
43
- sensitive=True, default_factory=lambda: DropboxAccessConfig()
42
+ supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
43
+ access_config: SecretDropboxAccessConfig = Field(
44
+ default_factory=lambda: SecretDropboxAccessConfig(secret_value=DropboxAccessConfig())
44
45
  )
45
- connector_type: str = CONNECTOR_TYPE
46
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
46
47
 
47
48
 
48
49
  @dataclass
@@ -57,6 +58,10 @@ class DropboxIndexer(FsspecIndexer):
57
58
  if not self.index_config.path_without_protocol.startswith("/"):
58
59
  self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
59
60
 
61
+ @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
62
+ def precheck(self) -> None:
63
+ super().precheck()
64
+
60
65
  @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
61
66
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
62
67
  return super().run(**kwargs)
@@ -68,7 +73,6 @@ class DropboxIndexer(FsspecIndexer):
68
73
  return sterilize_dict(data=info)
69
74
 
70
75
 
71
- @dataclass
72
76
  class DropboxDownloaderConfig(FsspecDownloaderConfig):
73
77
  pass
74
78
 
@@ -91,7 +95,6 @@ class DropboxDownloader(FsspecDownloader):
91
95
  return await super().run_async(file_data=file_data, **kwargs)
92
96
 
93
97
 
94
- @dataclass
95
98
  class DropboxUploaderConfig(FsspecUploaderConfig):
96
99
  pass
97
100
 
@@ -106,6 +109,10 @@ class DropboxUploader(FsspecUploader):
106
109
  def __post_init__(self):
107
110
  super().__post_init__()
108
111
 
112
+ @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
113
+ def precheck(self) -> None:
114
+ super().precheck()
115
+
109
116
  @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
110
117
  def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
111
118
  return super().run(contents=contents, **kwargs)
@@ -1,7 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import contextlib
4
- import fnmatch
5
4
  from dataclasses import dataclass, field
6
5
  from datetime import datetime
7
6
  from pathlib import Path
@@ -9,10 +8,13 @@ from time import time
9
8
  from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
10
9
  from uuid import NAMESPACE_DNS, uuid5
11
10
 
12
- from unstructured.documents.elements import DataSourceMetadata
11
+ from pydantic import BaseModel, Field, Secret
13
12
 
14
- from unstructured_ingest.enhanced_dataclass import enhanced_field
15
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
13
+ from unstructured_ingest.error import (
14
+ DestinationConnectionError,
15
+ SourceConnectionError,
16
+ SourceConnectionNetworkError,
17
+ )
16
18
  from unstructured_ingest.v2.interfaces import (
17
19
  AccessConfig,
18
20
  ConnectionConfig,
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
20
22
  DownloaderConfig,
21
23
  DownloadResponse,
22
24
  FileData,
25
+ FileDataSourceMetadata,
23
26
  Indexer,
24
27
  IndexerConfig,
25
28
  SourceIdentifiers,
@@ -36,17 +39,12 @@ if TYPE_CHECKING:
36
39
  CONNECTOR_TYPE = "fsspec"
37
40
 
38
41
 
39
- class Base(object):
40
- def __post_init__(self):
41
- pass
42
-
43
-
44
- @dataclass
45
- class FileConfig(Base):
46
- remote_url: str
47
- protocol: str = field(init=False)
48
- path_without_protocol: str = field(init=False)
49
- supported_protocols: list[str] = field(
42
+ class FileConfig(BaseModel):
43
+ remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
44
+ protocol: str = Field(init=False)
45
+ path_without_protocol: str = Field(init=False)
46
+ supported_protocols: list[str] = Field(
47
+ init=False,
50
48
  default_factory=lambda: [
51
49
  "s3",
52
50
  "s3a",
@@ -57,38 +55,27 @@ class FileConfig(Base):
57
55
  "box",
58
56
  "dropbox",
59
57
  "sftp",
60
- ]
58
+ ],
61
59
  )
62
60
 
63
- def __post_init__(self):
64
- super().__post_init__()
65
- self.protocol, self.path_without_protocol = self.remote_url.split("://")
66
- if self.protocol not in self.supported_protocols:
67
- raise ValueError(
68
- "Protocol {} not supported yet, only {} are supported.".format(
69
- self.protocol, ", ".join(self.supported_protocols)
70
- ),
71
- )
61
+ def __init__(self, **data):
62
+ protocol, path_without_protocol = data["remote_url"].split("://")
63
+ data["protocol"] = protocol
64
+ data["path_without_protocol"] = path_without_protocol
65
+ super().__init__(**data)
72
66
 
73
67
 
74
- @dataclass
75
68
  class FsspecIndexerConfig(FileConfig, IndexerConfig):
76
69
  recursive: bool = False
77
- file_glob: Optional[list[str]] = None
78
70
 
79
71
 
80
- @dataclass
81
72
  class FsspecAccessConfig(AccessConfig):
82
73
  pass
83
74
 
84
75
 
85
- FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
86
-
87
-
88
- @dataclass
89
76
  class FsspecConnectionConfig(ConnectionConfig):
90
- access_config: FsspecAccessConfigT = enhanced_field(sensitive=True, default=None)
91
- connector_type: str = CONNECTOR_TYPE
77
+ access_config: Secret[FsspecAccessConfig]
78
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
92
79
 
93
80
 
94
81
  FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
@@ -99,7 +86,7 @@ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnect
99
86
  class FsspecIndexer(Indexer):
100
87
  connection_config: FsspecConnectionConfigT
101
88
  index_config: FsspecIndexerConfigT
102
- connector_type: str = CONNECTOR_TYPE
89
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
103
90
 
104
91
  @property
105
92
  def fs(self) -> "AbstractFileSystem":
@@ -109,17 +96,7 @@ class FsspecIndexer(Indexer):
109
96
  **self.connection_config.get_access_config(),
110
97
  )
111
98
 
112
- def does_path_match_glob(self, path: str) -> bool:
113
- if self.index_config.file_glob is None:
114
- return True
115
- patterns = self.index_config.file_glob
116
- for pattern in patterns:
117
- if fnmatch.filter([path], pattern):
118
- return True
119
- logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
120
- return False
121
-
122
- def check_connection(self):
99
+ def precheck(self) -> None:
123
100
  from fsspec import get_filesystem_class
124
101
 
125
102
  try:
@@ -157,10 +134,10 @@ class FsspecIndexer(Indexer):
157
134
  else:
158
135
  raise TypeError(f"unhandled response type from find: {type(found)}")
159
136
 
160
- def get_metadata(self, path: str) -> DataSourceMetadata:
137
+ def get_metadata(self, path: str) -> FileDataSourceMetadata:
161
138
  date_created = None
162
139
  date_modified = None
163
-
140
+ file_size = None
164
141
  try:
165
142
  created: Optional[Any] = self.fs.created(path)
166
143
  if created:
@@ -180,6 +157,8 @@ class FsspecIndexer(Indexer):
180
157
  date_modified = str(modified)
181
158
  except NotImplementedError:
182
159
  pass
160
+ with contextlib.suppress(AttributeError):
161
+ file_size = self.fs.size(path)
183
162
 
184
163
  version = self.fs.checksum(path)
185
164
  metadata: dict[str, str] = {}
@@ -189,15 +168,19 @@ class FsspecIndexer(Indexer):
189
168
  "protocol": self.index_config.protocol,
190
169
  "remote_file_path": self.index_config.remote_url,
191
170
  }
171
+ file_stat = self.fs.stat(path=path)
172
+ if file_id := file_stat.get("id"):
173
+ record_locator["file_id"] = file_id
192
174
  if metadata:
193
175
  record_locator["metadata"] = metadata
194
- return DataSourceMetadata(
176
+ return FileDataSourceMetadata(
195
177
  date_created=date_created,
196
178
  date_modified=date_modified,
197
179
  date_processed=str(time()),
198
180
  version=str(version),
199
181
  url=f"{self.index_config.protocol}://{path}",
200
182
  record_locator=record_locator,
183
+ filesize_bytes=file_size,
201
184
  )
202
185
 
203
186
  def sterilize_info(self, path) -> dict:
@@ -205,8 +188,7 @@ class FsspecIndexer(Indexer):
205
188
  return sterilize_dict(data=info)
206
189
 
207
190
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
208
- raw_files = self.list_files()
209
- files = [f for f in raw_files if self.does_path_match_glob(f)]
191
+ files = self.list_files()
210
192
  for file in files:
211
193
  # Note: we remove any remaining leading slashes (Box introduces these)
212
194
  # to get a valid relative path
@@ -227,7 +209,6 @@ class FsspecIndexer(Indexer):
227
209
  )
228
210
 
229
211
 
230
- @dataclass
231
212
  class FsspecDownloaderConfig(DownloaderConfig):
232
213
  pass
233
214
 
@@ -255,13 +236,6 @@ class FsspecDownloader(Downloader):
255
236
  **self.connection_config.get_access_config(),
256
237
  )
257
238
 
258
- def get_download_path(self, file_data: FileData) -> Path:
259
- return (
260
- self.download_dir / Path(file_data.source_identifiers.relative_path)
261
- if self.download_config
262
- else Path(file_data.source_identifiers.rel_path)
263
- )
264
-
265
239
  def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
266
240
  download_path = self.get_download_path(file_data=file_data)
267
241
  download_path.parent.mkdir(parents=True, exist_ok=True)
@@ -285,9 +259,10 @@ class FsspecDownloader(Downloader):
285
259
  return self.generate_download_response(file_data=file_data, download_path=download_path)
286
260
 
287
261
 
288
- @dataclass
289
262
  class FsspecUploaderConfig(FileConfig, UploaderConfig):
290
- overwrite: bool = False
263
+ overwrite: bool = Field(
264
+ default=False, description="If true, an existing file will be overwritten."
265
+ )
291
266
 
292
267
 
293
268
  FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
@@ -315,6 +290,19 @@ class FsspecUploader(Uploader):
315
290
  f"missing 1 required positional argument: 'upload_config'"
316
291
  )
317
292
 
293
+ def precheck(self) -> None:
294
+ from fsspec import get_filesystem_class
295
+
296
+ try:
297
+ fs = get_filesystem_class(self.upload_config.protocol)(
298
+ **self.connection_config.get_access_config(),
299
+ )
300
+ root_dir = self.upload_config.path_without_protocol.split("/")[0]
301
+ fs.ls(path=root_dir, detail=False)
302
+ except Exception as e:
303
+ logger.error(f"failed to validate connection: {e}", exc_info=True)
304
+ raise DestinationConnectionError(f"failed to validate connection: {e}")
305
+
318
306
  def get_upload_path(self, file_data: FileData) -> Path:
319
307
  upload_path = (
320
308
  Path(self.upload_config.path_without_protocol)
@@ -4,7 +4,8 @@ from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
  from typing import Any, Generator, Optional, Union
6
6
 
7
- from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from pydantic import Field, Secret
8
+
8
9
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
10
  from unstructured_ingest.utils.string_and_date_utils import json_to_dict
10
11
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
@@ -26,17 +27,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
27
  CONNECTOR_TYPE = "gcs"
27
28
 
28
29
 
29
- @dataclass
30
30
  class GcsIndexerConfig(FsspecIndexerConfig):
31
31
  pass
32
32
 
33
33
 
34
- @dataclass
34
+ service_account_key_description = """
35
+ Options:
36
+ - ``None``, GCSFS will attempt to guess your credentials in the
37
+ following order: gcloud CLI default, gcsfs cached token, google compute
38
+ metadata service, anonymous.
39
+ - ``'google_default'``, your default gcloud credentials will be used,
40
+ which are typically established by doing ``gcloud login`` in a terminal.
41
+ - ``'cache'``, credentials from previously successful gcsfs
42
+ authentication will be used (use this after "browser" auth succeeded)
43
+ - ``'anon'``, no authentication is performed, and you can only
44
+ access data which is accessible to allUsers (in this case, the project and
45
+ access level parameters are meaningless)
46
+ - ``'browser'``, you get an access code with which you can
47
+ authenticate via a specially provided URL
48
+ - if ``'cloud'``, we assume we are running within google compute
49
+ or google container engine, and query the internal metadata directly for
50
+ a token.
51
+ - you may supply a token generated by the
52
+ [gcloud](https://cloud.google.com/sdk/docs/)
53
+ utility; this is either a python dictionary or the name of a file
54
+ containing the JSON returned by logging in with the gcloud CLI tool.
55
+ """
56
+
57
+
35
58
  class GcsAccessConfig(FsspecAccessConfig):
36
- service_account_key: Optional[str] = None
37
- token: Union[str, dict, None] = field(init=False, default=None)
59
+ service_account_key: Optional[str] = Field(
60
+ default=None, description=service_account_key_description
61
+ )
62
+ token: Union[str, dict, None] = Field(init=False, default=None)
38
63
 
39
- def __post_init__(self):
64
+ def model_post_init(self, __context: Any) -> None:
40
65
  ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"
41
66
 
42
67
  # Case: null value
@@ -61,13 +86,15 @@ class GcsAccessConfig(FsspecAccessConfig):
61
86
  raise ValueError("Invalid auth token value")
62
87
 
63
88
 
64
- @dataclass
89
+ SecretGcsAccessConfig = Secret[GcsAccessConfig]
90
+
91
+
65
92
  class GcsConnectionConfig(FsspecConnectionConfig):
66
- supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"])
67
- access_config: GcsAccessConfig = enhanced_field(
68
- sensitive=True, default_factory=lambda: GcsAccessConfig()
93
+ supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
94
+ access_config: SecretGcsAccessConfig = Field(
95
+ default_factory=lambda: SecretGcsAccessConfig(secret_value=GcsAccessConfig())
69
96
  )
70
- connector_type: str = CONNECTOR_TYPE
97
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
71
98
 
72
99
 
73
100
  @dataclass
@@ -80,8 +107,11 @@ class GcsIndexer(FsspecIndexer):
80
107
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
81
108
  return super().run(**kwargs)
82
109
 
110
+ @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
111
+ def precheck(self) -> None:
112
+ super().precheck()
113
+
83
114
 
84
- @dataclass
85
115
  class GcsDownloaderConfig(FsspecDownloaderConfig):
86
116
  pass
87
117
 
@@ -102,7 +132,6 @@ class GcsDownloader(FsspecDownloader):
102
132
  return await super().run_async(file_data=file_data, **kwargs)
103
133
 
104
134
 
105
- @dataclass
106
135
  class GcsUploaderConfig(FsspecUploaderConfig):
107
136
  pass
108
137
 
@@ -117,6 +146,10 @@ class GcsUploader(FsspecUploader):
117
146
  def __post_init__(self):
118
147
  super().__post_init__()
119
148
 
149
+ @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
150
+ def precheck(self) -> None:
151
+ super().precheck()
152
+
120
153
  @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
121
154
  def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
122
155
  return super().run(contents=contents, **kwargs)