unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py

@@ -1,10 +1,11 @@
 import json
-import typing as t
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -20,27 +21,31 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
-    add_destination_entry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
-if t.TYPE_CHECKING:
+if TYPE_CHECKING:
     from azure.search.documents import SearchClient
 
 
 CONNECTOR_TYPE = "azure_cognitive_search"
 
 
-@dataclass
 class AzureCognitiveSearchAccessConfig(AccessConfig):
-    key: t.Optional[str] = enhanced_field(default=None, overload_name="azure_cognitive_search_key")
+    azure_cognitive_search_key: str = Field(
+        alias="key", description="Credential that is used for authenticating to an Azure service"
+    )
 
 
-@dataclass
 class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
-    endpoint: str
-    index: str
-    access_config: AzureCognitiveSearchAccessConfig = enhanced_field(sensitive=True)
+    endpoint: str = Field(
+        description="The URL endpoint of an Azure AI (Cognitive) search service. "
+        "In the form of https://{{service_name}}.search.windows.net"
+    )
+    index: str = Field(
+        description="The name of the Azure AI (Cognitive) Search index to connect to."
+    )
+    access_config: Secret[AzureCognitiveSearchAccessConfig]
 
     @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
     def generate_client(self) -> "SearchClient":
@@ -50,18 +55,18 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
         return SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
-            credential=AzureKeyCredential(self.access_config.key),
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_cognitive_search_key
+            ),
         )
 
 
-@dataclass
 class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-@dataclass
 class AzureCognitiveSearchUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
@@ -122,7 +127,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
-        **kwargs: t.Any,
+        **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
@@ -143,7 +148,7 @@ class AzureCognitiveSearchUploader(Uploader):
 
     @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
         import azure.core.exceptions
 
         logger.info(
@@ -169,7 +174,8 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.key}: [{error.status_code}] {error.error_message}"
+                        f"{error.azure_cognitive_search_key}: "
+                        f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
@@ -186,7 +192,7 @@ class AzureCognitiveSearchUploader(Uploader):
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)
 
-    def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
 
         elements_dict = []
         for content in contents:
@@ -207,13 +213,10 @@ class AzureCognitiveSearchUploader(Uploader):
            self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-add_destination_entry(
-    destination_type=CONNECTOR_TYPE,
-    entry=DestinationRegistryEntry(
-        connection_config=AzureCognitiveSearchConnectionConfig,
-        uploader=AzureCognitiveSearchUploader,
-        uploader_config=AzureCognitiveSearchUploaderConfig,
-        upload_stager=AzureCognitiveSearchUploadStager,
-        upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
-    ),
+azure_cognitive_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureCognitiveSearchConnectionConfig,
+    uploader=AzureCognitiveSearchUploader,
+    uploader_config=AzureCognitiveSearchUploaderConfig,
+    upload_stager=AzureCognitiveSearchUploadStager,
+    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
 )
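
The access key is now wrapped in a pydantic Secret and stored under the new field name azure_cognitive_search_key, while alias="key" keeps the old input name working. A minimal usage sketch, assuming pydantic v2 semantics for Secret and field aliases; the endpoint, index, and key values are placeholders:

    from unstructured_ingest.v2.processes.connectors.azure_cognitive_search import (
        AzureCognitiveSearchAccessConfig,
        AzureCognitiveSearchConnectionConfig,
    )

    # Placeholder values; "key" is still accepted via the pydantic alias.
    connection_config = AzureCognitiveSearchConnectionConfig(
        endpoint="https://<service_name>.search.windows.net",
        index="ingest-test",
        access_config=AzureCognitiveSearchAccessConfig(key="<search-api-key>"),
    )

    # The raw credential is only reachable through the Secret wrapper.
    key = connection_config.access_config.get_secret_value().azure_cognitive_search_key

    # Requires the azure-cognitive-search extra to be installed.
    client = connection_config.generate_client()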

unstructured_ingest/v2/processes/connectors/chroma.py

@@ -3,11 +3,11 @@ import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Optional
 
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,26 +32,35 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "chroma"
 
 
-@dataclass
 class ChromaAccessConfig(AccessConfig):
-    settings: Optional[Dict[str, str]] = None
-    headers: Optional[Dict[str, str]] = None
+    settings: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of settings to communicate with the chroma server."
+    )
+    headers: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of headers to send to the Chroma server."
+    )
 
 
-@dataclass
 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str
-    access_config: ChromaAccessConfig = enhanced_field(sensitive=True)
-    path: Optional[str] = None
-    tenant: Optional[str] = "default_tenant"
-    database: Optional[str] = "default_database"
-    host: Optional[str] = None
-    port: Optional[int] = None
-    ssl: bool = False
-    connector_type: str = CONNECTOR_TYPE
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
+    access_config: Secret[ChromaAccessConfig]
+    path: Optional[str] = Field(
+        default=None, description="Location where Chroma is persisted, if not connecting via http."
+    )
+    tenant: Optional[str] = Field(
+        default="default_tenant", description="The tenant to use for this client."
+    )
+    database: Optional[str] = Field(
+        default="default_database", description="The database to use for this client."
+    )
+    host: Optional[str] = Field(default=None, description="The hostname of the Chroma server.")
+    port: Optional[int] = Field(default=None, description="The port of the Chroma server.")
+    ssl: bool = Field(
+        default=False, description="Whether to use SSL to connect to the Chroma server."
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -101,9 +110,8 @@ class ChromaUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class ChromaUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
@@ -123,10 +131,11 @@ class ChromaUploader(Uploader):
     def create_client(self) -> "Client":
         import chromadb
 
+        access_config = self.connection_config.access_config.get_secret_value()
         if self.connection_config.path:
            return chromadb.PersistentClient(
                path=self.connection_config.path,
-                settings=self.connection_config.access_config.settings,
+                settings=access_config.settings,
                tenant=self.connection_config.tenant,
                database=self.connection_config.database,
            )
@@ -136,8 +145,8 @@ class ChromaUploader(Uploader):
                host=self.connection_config.host,
                port=self.connection_config.port,
                ssl=self.connection_config.ssl,
-                headers=self.connection_config.access_config.headers,
-                settings=self.connection_config.access_config.settings,
+                headers=access_config.headers,
+                settings=access_config.settings,
                tenant=self.connection_config.tenant,
                database=self.connection_config.database,
            )

unstructured_ingest/v2/processes/connectors/couchbase.py (new file)

@@ -0,0 +1,151 @@
+import json
+from dataclasses import dataclass, field
+from datetime import timedelta
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    UploadContent,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from couchbase.cluster import Cluster
+
+CONNECTOR_TYPE = "couchbase"
+SERVER_API_VERSION = "1"
+
+
+class CouchbaseAccessConfig(AccessConfig):
+    password: str = Field(description="The password for the Couchbase server")
+
+
+class CouchbaseConnectionConfig(ConnectionConfig):
+    username: str = Field(description="The username for the Couchbase server")
+    bucket: str = Field(description="The bucket to connect to on the Couchbase server")
+    connection_string: str = Field(
+        default="couchbase://localhost", description="The connection string of the Couchbase server"
+    )
+    scope: str = Field(
+        default="_default", description="The scope to connect to on the Couchbase server"
+    )
+    collection: str = Field(
+        default="_default", description="The collection to connect to on the Couchbase server"
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    access_config: Secret[CouchbaseAccessConfig]
+
+
+class CouchbaseUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class CouchbaseUploadStager(UploadStager):
+    upload_stager_config: CouchbaseUploadStagerConfig = field(
+        default_factory=lambda: CouchbaseUploadStagerConfig()
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+
+        output_elements = []
+        for element in elements_contents:
+            new_doc = {
+                element["element_id"]: {
+                    "embedding": element.get("embeddings", None),
+                    "text": element.get("text", None),
+                    "metadata": element.get("metadata", None),
+                    "type": element.get("type", None),
+                }
+            }
+            output_elements.append(new_doc)
+
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        with open(output_path, "w") as output_file:
+            json.dump(output_elements, output_file)
+        return output_path
+
+
+class CouchbaseUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of documents to upload per batch")
+
+
+@dataclass
+class CouchbaseUploader(Uploader):
+    connection_config: CouchbaseConnectionConfig
+    upload_config: CouchbaseUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def connect_to_couchbase(self) -> "Cluster":
+        from couchbase.auth import PasswordAuthenticator
+        from couchbase.cluster import Cluster
+        from couchbase.options import ClusterOptions
+
+        connection_string = self.connection_config.connection_string
+        username = self.connection_config.username
+        password = self.connection_config.access_config.get_secret_value().password
+
+        auth = PasswordAuthenticator(username, password)
+        options = ClusterOptions(auth)
+        options.apply_profile("wan_development")
+        cluster = Cluster(connection_string, options)
+        cluster.wait_until_ready(timedelta(seconds=5))
+        return cluster
+
+    def precheck(self) -> None:
+        try:
+            self.connect_to_couchbase()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+        elements = []
+        for content in contents:
+            with open(content.path) as elements_file:
+                elements.extend(json.load(elements_file))
+
+        logger.info(
+            f"writing {len(elements)} objects to destination "
+            f"bucket, {self.connection_config.bucket} "
+            f"at {self.connection_config.connection_string}",
+        )
+        cluster = self.connect_to_couchbase()
+        bucket = cluster.bucket(self.connection_config.bucket)
+        scope = bucket.scope(self.connection_config.scope)
+        collection = scope.collection(self.connection_config.collection)
+
+        for chunk in batch_generator(elements, self.upload_config.batch_size):
+            collection.upsert_multi({doc_id: doc for doc in chunk for doc_id, doc in doc.items()})
+
+
+couchbase_destination_entry = DestinationRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    uploader=CouchbaseUploader,
+    uploader_config=CouchbaseUploaderConfig,
+    upload_stager=CouchbaseUploadStager,
+    upload_stager_config=CouchbaseUploadStagerConfig,
+)
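
Couchbase is a brand-new destination in this release. A minimal wiring sketch, assuming the defaults shown in the new module; the cluster address, credentials, and bucket name are placeholders:

    from unstructured_ingest.v2.processes.connectors.couchbase import (
        CouchbaseAccessConfig,
        CouchbaseConnectionConfig,
        CouchbaseUploader,
        CouchbaseUploaderConfig,
    )

    connection_config = CouchbaseConnectionConfig(
        connection_string="couchbase://localhost",  # module default
        username="Administrator",                   # placeholder
        bucket="ingest-bucket",                     # placeholder
        access_config=CouchbaseAccessConfig(password="<password>"),
    )

    uploader = CouchbaseUploader(
        connection_config=connection_config,
        upload_config=CouchbaseUploaderConfig(batch_size=50),
    )

    # Raises DestinationConnectionError if the cluster cannot be reached;
    # requires the couchbase extra to be installed.
    uploader.precheck()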

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -1,8 +1,9 @@
 import os
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from pydantic import Field, Secret
+
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -21,45 +22,99 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "databricks_volumes"
 
 
-@dataclass
 class DatabricksVolumesAccessConfig(AccessConfig):
-    account_id: Optional[str] = None
-    username: Optional[str] = None
-    password: Optional[str] = None
-    client_id: Optional[str] = None
-    client_secret: Optional[str] = None
-    token: Optional[str] = None
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks "
+        "accounts endpoint. Only has effect when Host is "
+        "either https://accounts.cloud.databricks.com/ (AWS), "
+        "https://accounts.azuredatabricks.net/ (Azure), "
+        "or https://accounts.gcp.databricks.com/ (GCP).",
+    )
+    username: Optional[str] = Field(
+        default=None,
+        description="The Databricks username part of basic authentication. "
+        "Only possible when Host is *.cloud.databricks.com (AWS).",
+    )
+    password: Optional[str] = Field(
+        default=None,
+        description="The Databricks password part of basic authentication. "
+        "Only possible when Host is *.cloud.databricks.com (AWS).",
+    )
+    client_id: Optional[str] = Field(default=None)
+    client_secret: Optional[str] = Field(default=None)
+    token: Optional[str] = Field(
+        default=None,
+        description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
+        "Azure Active Directory (Azure AD) token (Azure).",
+    )
     profile: Optional[str] = None
-    azure_workspace_resource_id: Optional[str] = None
-    azure_client_secret: Optional[str] = None
-    azure_client_id: Optional[str] = None
-    azure_tenant_id: Optional[str] = None
-    azure_environment: Optional[str] = None
-    auth_type: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+    azure_client_secret: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s client secret."
+    )
+    azure_client_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s application ID."
+    )
+    azure_tenant_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s tenant ID."
+    )
+    azure_environment: Optional[str] = Field(
+        default=None,
+        description="The Azure environment type for a " "specific set of API endpoints",
+        examples=["Public", "UsGov", "China", "Germany"],
+    )
+    auth_type: Optional[str] = Field(
+        default=None,
+        description="When multiple auth attributes are available in the "
+        "environment, use the auth type specified by this "
+        "argument. This argument also holds the currently "
+        "selected auth.",
+    )
     cluster_id: Optional[str] = None
     google_credentials: Optional[str] = None
     google_service_account: Optional[str] = None
 
 
-@dataclass
+SecretDatabricksVolumesAccessConfig = Secret[DatabricksVolumesAccessConfig]
+
+
 class DatabricksVolumesConnectionConfig(ConnectionConfig):
-    access_config: DatabricksVolumesAccessConfig = enhanced_field(
-        default_factory=DatabricksVolumesAccessConfig, sensitive=True
+    access_config: SecretDatabricksVolumesAccessConfig = Field(
+        default_factory=lambda: SecretDatabricksVolumesAccessConfig(
+            secret_value=DatabricksVolumesAccessConfig()
+        )
+    )
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
    )
-    host: Optional[str] = None
 
 
-@dataclass
 class DatabricksVolumesUploaderConfig(UploaderConfig):
-    volume: str
-    catalog: str
-    volume_path: Optional[str] = None
-    overwrite: bool = False
-    schema: str = "default"
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )
 
     @property
     def path(self) -> str:
-        path = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}"
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
         if self.volume_path:
             path = f"{path}/{self.volume_path}"
         return path
@@ -70,19 +125,19 @@ class DatabricksVolumesUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: DatabricksVolumesUploaderConfig
     connection_config: DatabricksVolumesConnectionConfig
-    client: Optional["WorkspaceClient"] = field(init=False, default=None)
 
     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
-    def __post_init__(self) -> "WorkspaceClient":
+    def get_client(self) -> "WorkspaceClient":
         from databricks.sdk import WorkspaceClient
 
-        self.client = WorkspaceClient(
-            host=self.connection_config.host, **self.connection_config.access_config.to_dict()
+        return WorkspaceClient(
+            host=self.connection_config.host,
+            **self.connection_config.access_config.get_secret_value().dict(),
        )
 
     def precheck(self) -> None:
         try:
-            assert self.client.current_user.me().active
+            assert self.get_client().current_user.me().active
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -91,7 +146,7 @@ class DatabricksVolumesUploader(Uploader):
         for content in contents:
             with open(content.path, "rb") as elements_file:
                 output_path = os.path.join(self.upload_config.path, content.path.name)
-                self.client.files.upload(
+                self.get_client().files.upload(
                     file_path=output_path,
                     contents=elements_file,
                     overwrite=self.upload_config.overwrite,
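
Two behavioral notes fall out of this diff: the uploader config now stores the schema as databricks_schema while still accepting the old name through alias="schema", and the WorkspaceClient is created lazily by get_client() instead of in __post_init__. A small sketch of the config side, assuming pydantic's default populate-by-alias behavior; the catalog, schema, and volume values are placeholders:

    from unstructured_ingest.v2.processes.connectors.databricks_volumes import (
        DatabricksVolumesUploaderConfig,
    )

    upload_config = DatabricksVolumesUploaderConfig(
        catalog="main",              # placeholder
        schema="default",            # accepted via alias="schema", stored as databricks_schema
        volume="ingest-output",      # placeholder
        volume_path="partitioned",   # optional sub-path
    )

    # The upload target is assembled by the path property:
    # /Volumes/main/default/ingest-output/partitioned
    print(upload_config.path)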