unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (125):
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0

Selected file diffs (3 of the 125 changed files are shown below):

unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py}

@@ -3,10 +3,10 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured import __name__ as integration_name
-from unstructured.__version__ import __version__ as integration_version
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest import __name__ as integration_name
+from unstructured_ingest.__version__ import __version__ as integration_version
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -28,30 +28,27 @@ from unstructured_ingest.v2.processes.connector_registry import (
 if TYPE_CHECKING:
     from astrapy.db import AstraDBCollection
 
-CONNECTOR_TYPE = "astra"
+CONNECTOR_TYPE = "astradb"
 
 
-@dataclass
-class AstraAccessConfig(AccessConfig):
-    token: str
-    api_endpoint: str
+class AstraDBAccessConfig(AccessConfig):
+    token: str = Field(description="Astra DB Token with access to the database.")
+    api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
 
 
-@dataclass
-class AstraConnectionConfig(ConnectionConfig):
-    connection_type: str = CONNECTOR_TYPE
-    access_config: AstraAccessConfig = enhanced_field(sensitive=True)
+class AstraDBConnectionConfig(ConnectionConfig):
+    connection_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    access_config: Secret[AstraDBAccessConfig]
 
 
-@dataclass
-class AstraUploadStagerConfig(UploadStagerConfig):
+class AstraDBUploadStagerConfig(UploadStagerConfig):
     pass
 
 
 @dataclass
-class AstraUploadStager(UploadStager):
-    upload_stager_config: AstraUploadStagerConfig = field(
-        default_factory=lambda: AstraUploadStagerConfig()
+class AstraDBUploadStager(UploadStager):
+    upload_stager_config: AstraDBUploadStagerConfig = field(
+        default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
     def conform_dict(self, element_dict: dict) -> dict:
@@ -80,19 +77,28 @@ class AstraUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
-class AstraUploaderConfig(UploaderConfig):
-    collection_name: str
-    embedding_dimension: int
-    namespace: Optional[str] = None
-    requested_indexing_policy: Optional[dict[str, Any]] = None
-    batch_size: int = 20
+class AstraDBUploaderConfig(UploaderConfig):
+    collection_name: str = Field(
+        description="The name of the Astra DB collection. "
+        "Note that the collection name must only include letters, "
+        "numbers, and underscores."
+    )
+    embedding_dimension: int = Field(
+        default=384, description="The dimensionality of the embeddings"
+    )
+    namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
+    requested_indexing_policy: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="The indexing policy to use for the collection.",
+        examples=['{"deny": ["metadata"]}'],
+    )
+    batch_size: int = Field(default=20, description="Number of records per batch")
 
 
 @dataclass
-class AstraUploader(Uploader):
-    connection_config: AstraConnectionConfig
-    upload_config: AstraUploaderConfig
+class AstraDBUploader(Uploader):
+    connection_config: AstraDBConnectionConfig
+    upload_config: AstraDBUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
     def precheck(self) -> None:
@@ -102,7 +108,7 @@ class AstraUploader(Uploader):
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    @requires_dependencies(["astrapy"], extras="astra")
+    @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB
 
@@ -111,14 +117,15 @@ class AstraUploader(Uploader):
         embedding_dimension = self.upload_config.embedding_dimension
         requested_indexing_policy = self.upload_config.requested_indexing_policy
 
-        # If the user has requested an indexing policy, pass it to the AstraDB
+        # If the user has requested an indexing policy, pass it to the Astra DB
         options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
 
         # Build the Astra DB object.
         # caller_name/version for AstraDB tracking
+        access_configs = self.connection_config.access_config.get_secret_value()
         astra_db = AstraDB(
-            api_endpoint=self.connection_config.access_config.api_endpoint,
-            token=self.connection_config.access_config.token,
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
             namespace=self.upload_config.namespace,
             caller_name=integration_name,
             caller_version=integration_version,
@@ -144,17 +151,17 @@ class AstraUploader(Uploader):
             f"collection {self.upload_config.collection_name}"
         )
 
-        astra_batch_size = self.upload_config.batch_size
+        astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
-        for chunk in batch_generator(elements_dict, astra_batch_size):
+        for chunk in batch_generator(elements_dict, astra_db_batch_size):
            collection.insert_many(chunk)
 
 
-astra_destination_entry = DestinationRegistryEntry(
-    connection_config=AstraConnectionConfig,
-    upload_stager_config=AstraUploadStagerConfig,
-    upload_stager=AstraUploadStager,
-    uploader_config=AstraUploaderConfig,
-    uploader=AstraUploader,
+astra_db_destination_entry = DestinationRegistryEntry(
+    connection_config=AstraDBConnectionConfig,
+    upload_stager_config=AstraDBUploadStagerConfig,
+    upload_stager=AstraDBUploadStager,
+    uploader_config=AstraDBUploaderConfig,
+    uploader=AstraDBUploader,
 )
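
This diff illustrates the pattern repeated across most connectors in the release: the @dataclass/enhanced_field(sensitive=True) combination is replaced by pydantic models, with credentials wrapped in Secret[...] and unwrapped via get_secret_value() at the point of use. A minimal usage sketch, assuming the public import path follows the file rename shown above (the token and endpoint values are placeholders, not real credentials):

    from unstructured_ingest.v2.processes.connectors.astradb import (
        AstraDBAccessConfig,
        AstraDBConnectionConfig,
    )

    # Placeholder credentials for illustration only.
    config = AstraDBConnectionConfig(
        access_config=AstraDBAccessConfig(
            token="AstraCS:placeholder",
            api_endpoint="https://01234567-region.apps.astra.datastax.com",
        )
    )

    # pydantic's Secret wrapper masks the payload in reprs, logs, and model dumps;
    # consumers unwrap it explicitly, exactly as AstraDBUploader now does.
    print(config.access_config)  # prints a masked value, not the token
    creds = config.access_config.get_secret_value()
    assert creds.api_endpoint.startswith("https://")

Note also that the extras label in requires_dependencies changes from "astra" to "astradb", so installs that relied on the old extra would switch to something like pip install "unstructured-ingest[astradb]" (assuming the package metadata defines the renamed extra).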

unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py

@@ -1,10 +1,11 @@
 import json
-import typing as t
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -20,27 +21,31 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
-    add_destination_entry,
 )
 from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
-if t.TYPE_CHECKING:
+if TYPE_CHECKING:
     from azure.search.documents import SearchClient
 
 
 CONNECTOR_TYPE = "azure_cognitive_search"
 
 
-@dataclass
 class AzureCognitiveSearchAccessConfig(AccessConfig):
-    key: t.Optional[str] = enhanced_field(default=None, overload_name="azure_cognitive_search_key")
+    azure_cognitive_search_key: str = Field(
+        alias="key", description="Credential that is used for authenticating to an Azure service"
+    )
 
 
-@dataclass
 class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
-    endpoint: str
-    index: str
-    access_config: AzureCognitiveSearchAccessConfig = enhanced_field(sensitive=True)
+    endpoint: str = Field(
+        description="The URL endpoint of an Azure AI (Cognitive) search service. "
+        "In the form of https://{{service_name}}.search.windows.net"
+    )
+    index: str = Field(
+        description="The name of the Azure AI (Cognitive) Search index to connect to."
+    )
+    access_config: Secret[AzureCognitiveSearchAccessConfig]
 
     @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
     def generate_client(self) -> "SearchClient":
@@ -50,18 +55,18 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
         return SearchClient(
             endpoint=self.endpoint,
             index_name=self.index,
-            credential=AzureKeyCredential(self.access_config.key),
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_cognitive_search_key
+            ),
         )
 
 
-@dataclass
 class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-@dataclass
 class AzureCognitiveSearchUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
@@ -122,7 +127,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
-        **kwargs: t.Any,
+        **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
@@ -143,7 +148,7 @@ class AzureCognitiveSearchUploader(Uploader):
 
     @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
        import azure.core.exceptions
 
        logger.info(
@@ -169,7 +174,8 @@
                 raise WriteError(
                     ", ".join(
                         [
-                            f"{error.key}: [{error.status_code}] {error.error_message}"
+                            f"{error.azure_cognitive_search_key}: "
+                            f"[{error.status_code}] {error.error_message}"
                             for error in errors
                         ],
                     ),
@@ -186,7 +192,7 @@
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)
 
-    def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
 
         elements_dict = []
         for content in contents:
@@ -207,13 +213,10 @@
             self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-add_destination_entry(
-    destination_type=CONNECTOR_TYPE,
-    entry=DestinationRegistryEntry(
-        connection_config=AzureCognitiveSearchConnectionConfig,
-        uploader=AzureCognitiveSearchUploader,
-        uploader_config=AzureCognitiveSearchUploaderConfig,
-        upload_stager=AzureCognitiveSearchUploadStager,
-        upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
-    ),
+azure_cognitive_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureCognitiveSearchConnectionConfig,
+    uploader=AzureCognitiveSearchUploader,
+    uploader_config=AzureCognitiveSearchUploaderConfig,
+    upload_stager=AzureCognitiveSearchUploadStager,
+    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
 )
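
Here the sensitive field itself is renamed (key → azure_cognitive_search_key) while a pydantic alias keeps the external spelling stable. A self-contained sketch of the alias behavior, using a plain pydantic model as a stand-in for the project's AccessConfig base class:

    from pydantic import BaseModel, Field

    # Hypothetical stand-in for AzureCognitiveSearchAccessConfig.
    class DemoAccessConfig(BaseModel):
        azure_cognitive_search_key: str = Field(alias="key")

    cfg = DemoAccessConfig(key="placeholder-key")  # callers keep populating via the alias
    print(cfg.azure_cognitive_search_key)          # code reads the new attribute name

One side effect worth noting: the @@ -169,7 +174,8 @@ hunk applies the same rename inside an f-string over the Azure SDK's indexing error objects (error.key becomes error.azure_cognitive_search_key). Those objects come from azure.search.documents, whose attribute is key, so this looks like it may be an unintended find-and-replace rather than a deliberate change.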

unstructured_ingest/v2/processes/connectors/chroma.py

@@ -3,11 +3,11 @@ import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Optional
 
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,26 +32,35 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "chroma"
 
 
-@dataclass
 class ChromaAccessConfig(AccessConfig):
-    settings: Optional[Dict[str, str]] = None
-    headers: Optional[Dict[str, str]] = None
+    settings: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of settings to communicate with the chroma server."
+    )
+    headers: Optional[dict[str, str]] = Field(
+        default=None, description="A dictionary of headers to send to the Chroma server."
+    )
 
 
-@dataclass
 class ChromaConnectionConfig(ConnectionConfig):
-    collection_name: str
-    access_config: ChromaAccessConfig = enhanced_field(sensitive=True)
-    path: Optional[str] = None
-    tenant: Optional[str] = "default_tenant"
-    database: Optional[str] = "default_database"
-    host: Optional[str] = None
-    port: Optional[int] = None
-    ssl: bool = False
-    connector_type: str = CONNECTOR_TYPE
+    collection_name: str = Field(description="The name of the Chroma collection to write into.")
+    access_config: Secret[ChromaAccessConfig]
+    path: Optional[str] = Field(
+        default=None, description="Location where Chroma is persisted, if not connecting via http."
+    )
+    tenant: Optional[str] = Field(
+        default="default_tenant", description="The tenant to use for this client."
+    )
+    database: Optional[str] = Field(
+        default="default_database", description="The database to use for this client."
+    )
+    host: Optional[str] = Field(default=None, description="The hostname of the Chroma server.")
+    port: Optional[int] = Field(default=None, description="The port of the Chroma server.")
+    ssl: bool = Field(
+        default=False, description="Whether to use SSL to connect to the Chroma server."
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class ChromaUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -101,9 +110,8 @@ class ChromaUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class ChromaUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
@@ -123,10 +131,11 @@ class ChromaUploader(Uploader):
     def create_client(self) -> "Client":
         import chromadb
 
+        access_config = self.connection_config.access_config.get_secret_value()
         if self.connection_config.path:
             return chromadb.PersistentClient(
                 path=self.connection_config.path,
-                settings=self.connection_config.access_config.settings,
+                settings=access_config.settings,
                 tenant=self.connection_config.tenant,
                 database=self.connection_config.database,
             )
@@ -136,8 +145,8 @@
                 host=self.connection_config.host,
                 port=self.connection_config.port,
                 ssl=self.connection_config.ssl,
-                headers=self.connection_config.access_config.headers,
-                settings=self.connection_config.access_config.settings,
+                headers=access_config.headers,
+                settings=access_config.settings,
                 tenant=self.connection_config.tenant,
                 database=self.connection_config.database,
             )
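
Beyond unwrapping the access config once at the top of create_client, the method keeps its two connection modes: a path selects an embedded, on-disk Chroma, otherwise it connects to a server. A condensed sketch of that branching, assuming chromadb is installed (chromadb.PersistentClient and chromadb.HttpClient are the library's constructors; the helper function and its parameters are illustrative):

    import chromadb

    def make_chroma_client(path, host, port, ssl, headers, settings, tenant, database):
        # A filesystem path means a local, embedded Chroma persisted on disk.
        if path:
            return chromadb.PersistentClient(
                path=path, settings=settings, tenant=tenant, database=database
            )
        # Otherwise talk to a running Chroma server over HTTP(S).
        return chromadb.HttpClient(
            host=host, port=port, ssl=ssl, headers=headers,
            settings=settings, tenant=tenant, database=database,
        )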