unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +9 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +27 -16
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
|
@@ -3,10 +3,10 @@ from dataclasses import dataclass, field
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import TYPE_CHECKING, Any, Optional
|
|
5
5
|
|
|
6
|
-
from
|
|
7
|
-
from unstructured.__version__ import __version__ as integration_version
|
|
6
|
+
from pydantic import Field, Secret
|
|
8
7
|
|
|
9
|
-
from unstructured_ingest
|
|
8
|
+
from unstructured_ingest import __name__ as integration_name
|
|
9
|
+
from unstructured_ingest.__version__ import __version__ as integration_version
|
|
10
10
|
from unstructured_ingest.error import DestinationConnectionError
|
|
11
11
|
from unstructured_ingest.utils.data_prep import batch_generator
|
|
12
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
@@ -28,30 +28,27 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
28
28
|
if TYPE_CHECKING:
|
|
29
29
|
from astrapy.db import AstraDBCollection
|
|
30
30
|
|
|
31
|
-
CONNECTOR_TYPE = "
|
|
31
|
+
CONNECTOR_TYPE = "astradb"
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
api_endpoint: str
|
|
34
|
+
class AstraDBAccessConfig(AccessConfig):
|
|
35
|
+
token: str = Field(description="Astra DB Token with access to the database.")
|
|
36
|
+
api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
|
|
38
37
|
|
|
39
38
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
access_config: AstraAccessConfig = enhanced_field(sensitive=True)
|
|
39
|
+
class AstraDBConnectionConfig(ConnectionConfig):
|
|
40
|
+
connection_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
41
|
+
access_config: Secret[AstraDBAccessConfig]
|
|
44
42
|
|
|
45
43
|
|
|
46
|
-
|
|
47
|
-
class AstraUploadStagerConfig(UploadStagerConfig):
|
|
44
|
+
class AstraDBUploadStagerConfig(UploadStagerConfig):
|
|
48
45
|
pass
|
|
49
46
|
|
|
50
47
|
|
|
51
48
|
@dataclass
|
|
52
|
-
class
|
|
53
|
-
upload_stager_config:
|
|
54
|
-
default_factory=lambda:
|
|
49
|
+
class AstraDBUploadStager(UploadStager):
|
|
50
|
+
upload_stager_config: AstraDBUploadStagerConfig = field(
|
|
51
|
+
default_factory=lambda: AstraDBUploadStagerConfig()
|
|
55
52
|
)
|
|
56
53
|
|
|
57
54
|
def conform_dict(self, element_dict: dict) -> dict:
|
|
@@ -80,19 +77,28 @@ class AstraUploadStager(UploadStager):
|
|
|
80
77
|
return output_path
|
|
81
78
|
|
|
82
79
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
80
|
+
class AstraDBUploaderConfig(UploaderConfig):
|
|
81
|
+
collection_name: str = Field(
|
|
82
|
+
description="The name of the Astra DB collection. "
|
|
83
|
+
"Note that the collection name must only include letters, "
|
|
84
|
+
"numbers, and underscores."
|
|
85
|
+
)
|
|
86
|
+
embedding_dimension: int = Field(
|
|
87
|
+
default=384, description="The dimensionality of the embeddings"
|
|
88
|
+
)
|
|
89
|
+
namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
|
|
90
|
+
requested_indexing_policy: Optional[dict[str, Any]] = Field(
|
|
91
|
+
default=None,
|
|
92
|
+
description="The indexing policy to use for the collection.",
|
|
93
|
+
examples=['{"deny": ["metadata"]}'],
|
|
94
|
+
)
|
|
95
|
+
batch_size: int = Field(default=20, description="Number of records per batch")
|
|
90
96
|
|
|
91
97
|
|
|
92
98
|
@dataclass
|
|
93
|
-
class
|
|
94
|
-
connection_config:
|
|
95
|
-
upload_config:
|
|
99
|
+
class AstraDBUploader(Uploader):
|
|
100
|
+
connection_config: AstraDBConnectionConfig
|
|
101
|
+
upload_config: AstraDBUploaderConfig
|
|
96
102
|
connector_type: str = CONNECTOR_TYPE
|
|
97
103
|
|
|
98
104
|
def precheck(self) -> None:
|
|
@@ -102,7 +108,7 @@ class AstraUploader(Uploader):
|
|
|
102
108
|
logger.error(f"Failed to validate connection {e}", exc_info=True)
|
|
103
109
|
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
|
104
110
|
|
|
105
|
-
@requires_dependencies(["astrapy"], extras="
|
|
111
|
+
@requires_dependencies(["astrapy"], extras="astradb")
|
|
106
112
|
def get_collection(self) -> "AstraDBCollection":
|
|
107
113
|
from astrapy.db import AstraDB
|
|
108
114
|
|
|
@@ -111,14 +117,15 @@ class AstraUploader(Uploader):
|
|
|
111
117
|
embedding_dimension = self.upload_config.embedding_dimension
|
|
112
118
|
requested_indexing_policy = self.upload_config.requested_indexing_policy
|
|
113
119
|
|
|
114
|
-
# If the user has requested an indexing policy, pass it to the
|
|
120
|
+
# If the user has requested an indexing policy, pass it to the Astra DB
|
|
115
121
|
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
|
|
116
122
|
|
|
117
123
|
# Build the Astra DB object.
|
|
118
124
|
# caller_name/version for AstraDB tracking
|
|
125
|
+
access_configs = self.connection_config.access_config.get_secret_value()
|
|
119
126
|
astra_db = AstraDB(
|
|
120
|
-
api_endpoint=
|
|
121
|
-
token=
|
|
127
|
+
api_endpoint=access_configs.api_endpoint,
|
|
128
|
+
token=access_configs.token,
|
|
122
129
|
namespace=self.upload_config.namespace,
|
|
123
130
|
caller_name=integration_name,
|
|
124
131
|
caller_version=integration_version,
|
|
@@ -144,17 +151,17 @@ class AstraUploader(Uploader):
|
|
|
144
151
|
f"collection {self.upload_config.collection_name}"
|
|
145
152
|
)
|
|
146
153
|
|
|
147
|
-
|
|
154
|
+
astra_db_batch_size = self.upload_config.batch_size
|
|
148
155
|
collection = self.get_collection()
|
|
149
156
|
|
|
150
|
-
for chunk in batch_generator(elements_dict,
|
|
157
|
+
for chunk in batch_generator(elements_dict, astra_db_batch_size):
|
|
151
158
|
collection.insert_many(chunk)
|
|
152
159
|
|
|
153
160
|
|
|
154
|
-
|
|
155
|
-
connection_config=
|
|
156
|
-
upload_stager_config=
|
|
157
|
-
upload_stager=
|
|
158
|
-
uploader_config=
|
|
159
|
-
uploader=
|
|
161
|
+
astra_db_destination_entry = DestinationRegistryEntry(
|
|
162
|
+
connection_config=AstraDBConnectionConfig,
|
|
163
|
+
upload_stager_config=AstraDBUploadStagerConfig,
|
|
164
|
+
upload_stager=AstraDBUploadStager,
|
|
165
|
+
uploader_config=AstraDBUploaderConfig,
|
|
166
|
+
uploader=AstraDBUploader,
|
|
160
167
|
)
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import json
|
|
2
|
-
import typing as t
|
|
3
2
|
import uuid
|
|
4
3
|
from dataclasses import dataclass, field
|
|
5
4
|
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
from pydantic import Field, Secret
|
|
6
8
|
|
|
7
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
8
9
|
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
9
10
|
from unstructured_ingest.utils.data_prep import batch_generator
|
|
10
11
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
@@ -20,27 +21,31 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
20
21
|
from unstructured_ingest.v2.logger import logger
|
|
21
22
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
22
23
|
DestinationRegistryEntry,
|
|
23
|
-
add_destination_entry,
|
|
24
24
|
)
|
|
25
25
|
from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
|
|
26
26
|
|
|
27
|
-
if
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
28
|
from azure.search.documents import SearchClient
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
CONNECTOR_TYPE = "azure_cognitive_search"
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
@dataclass
|
|
35
34
|
class AzureCognitiveSearchAccessConfig(AccessConfig):
|
|
36
|
-
|
|
35
|
+
azure_cognitive_search_key: str = Field(
|
|
36
|
+
alias="key", description="Credential that is used for authenticating to an Azure service"
|
|
37
|
+
)
|
|
37
38
|
|
|
38
39
|
|
|
39
|
-
@dataclass
|
|
40
40
|
class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
|
|
41
|
-
endpoint: str
|
|
42
|
-
|
|
43
|
-
|
|
41
|
+
endpoint: str = Field(
|
|
42
|
+
description="The URL endpoint of an Azure AI (Cognitive) search service. "
|
|
43
|
+
"In the form of https://{{service_name}}.search.windows.net"
|
|
44
|
+
)
|
|
45
|
+
index: str = Field(
|
|
46
|
+
description="The name of the Azure AI (Cognitive) Search index to connect to."
|
|
47
|
+
)
|
|
48
|
+
access_config: Secret[AzureCognitiveSearchAccessConfig]
|
|
44
49
|
|
|
45
50
|
@requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
|
|
46
51
|
def generate_client(self) -> "SearchClient":
|
|
@@ -50,18 +55,18 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
|
|
|
50
55
|
return SearchClient(
|
|
51
56
|
endpoint=self.endpoint,
|
|
52
57
|
index_name=self.index,
|
|
53
|
-
credential=AzureKeyCredential(
|
|
58
|
+
credential=AzureKeyCredential(
|
|
59
|
+
self.access_config.get_secret_value().azure_cognitive_search_key
|
|
60
|
+
),
|
|
54
61
|
)
|
|
55
62
|
|
|
56
63
|
|
|
57
|
-
@dataclass
|
|
58
64
|
class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
|
|
59
65
|
pass
|
|
60
66
|
|
|
61
67
|
|
|
62
|
-
@dataclass
|
|
63
68
|
class AzureCognitiveSearchUploaderConfig(UploaderConfig):
|
|
64
|
-
batch_size: int = 100
|
|
69
|
+
batch_size: int = Field(default=100, description="Number of records per batch")
|
|
65
70
|
|
|
66
71
|
|
|
67
72
|
@dataclass
|
|
@@ -122,7 +127,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
|
|
|
122
127
|
elements_filepath: Path,
|
|
123
128
|
output_dir: Path,
|
|
124
129
|
output_filename: str,
|
|
125
|
-
**kwargs:
|
|
130
|
+
**kwargs: Any,
|
|
126
131
|
) -> Path:
|
|
127
132
|
with open(elements_filepath) as elements_file:
|
|
128
133
|
elements_contents = json.load(elements_file)
|
|
@@ -143,7 +148,7 @@ class AzureCognitiveSearchUploader(Uploader):
|
|
|
143
148
|
|
|
144
149
|
@DestinationConnectionError.wrap
|
|
145
150
|
@requires_dependencies(["azure"], extras="azure-cognitive-search")
|
|
146
|
-
def write_dict(self, *args, elements_dict:
|
|
151
|
+
def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
|
|
147
152
|
import azure.core.exceptions
|
|
148
153
|
|
|
149
154
|
logger.info(
|
|
@@ -169,7 +174,8 @@ class AzureCognitiveSearchUploader(Uploader):
|
|
|
169
174
|
raise WriteError(
|
|
170
175
|
", ".join(
|
|
171
176
|
[
|
|
172
|
-
f"{error.
|
|
177
|
+
f"{error.azure_cognitive_search_key}: "
|
|
178
|
+
f"[{error.status_code}] {error.error_message}"
|
|
173
179
|
for error in errors
|
|
174
180
|
],
|
|
175
181
|
),
|
|
@@ -186,7 +192,7 @@ class AzureCognitiveSearchUploader(Uploader):
|
|
|
186
192
|
def write_dict_wrapper(self, elements_dict):
|
|
187
193
|
return self.write_dict(elements_dict=elements_dict)
|
|
188
194
|
|
|
189
|
-
def run(self, contents: list[UploadContent], **kwargs:
|
|
195
|
+
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
|
190
196
|
|
|
191
197
|
elements_dict = []
|
|
192
198
|
for content in contents:
|
|
@@ -207,13 +213,10 @@ class AzureCognitiveSearchUploader(Uploader):
|
|
|
207
213
|
self.write_dict(elements_dict=chunk) # noqa: E203
|
|
208
214
|
|
|
209
215
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
upload_stager=AzureCognitiveSearchUploadStager,
|
|
217
|
-
upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
|
|
218
|
-
),
|
|
216
|
+
azure_cognitive_search_destination_entry = DestinationRegistryEntry(
|
|
217
|
+
connection_config=AzureCognitiveSearchConnectionConfig,
|
|
218
|
+
uploader=AzureCognitiveSearchUploader,
|
|
219
|
+
uploader_config=AzureCognitiveSearchUploaderConfig,
|
|
220
|
+
upload_stager=AzureCognitiveSearchUploadStager,
|
|
221
|
+
upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
|
|
219
222
|
)
|
|
@@ -3,11 +3,11 @@ import uuid
|
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
4
|
from datetime import date, datetime
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import TYPE_CHECKING, Any,
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
7
7
|
|
|
8
8
|
from dateutil import parser
|
|
9
|
+
from pydantic import Field, Secret
|
|
9
10
|
|
|
10
|
-
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
11
11
|
from unstructured_ingest.error import DestinationConnectionError
|
|
12
12
|
from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
|
|
13
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
@@ -32,26 +32,35 @@ if TYPE_CHECKING:
|
|
|
32
32
|
CONNECTOR_TYPE = "chroma"
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
@dataclass
|
|
36
35
|
class ChromaAccessConfig(AccessConfig):
|
|
37
|
-
settings: Optional[
|
|
38
|
-
|
|
36
|
+
settings: Optional[dict[str, str]] = Field(
|
|
37
|
+
default=None, description="A dictionary of settings to communicate with the chroma server."
|
|
38
|
+
)
|
|
39
|
+
headers: Optional[dict[str, str]] = Field(
|
|
40
|
+
default=None, description="A dictionary of headers to send to the Chroma server."
|
|
41
|
+
)
|
|
39
42
|
|
|
40
43
|
|
|
41
|
-
@dataclass
|
|
42
44
|
class ChromaConnectionConfig(ConnectionConfig):
|
|
43
|
-
collection_name: str
|
|
44
|
-
access_config: ChromaAccessConfig
|
|
45
|
-
path: Optional[str] =
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
45
|
+
collection_name: str = Field(description="The name of the Chroma collection to write into.")
|
|
46
|
+
access_config: Secret[ChromaAccessConfig]
|
|
47
|
+
path: Optional[str] = Field(
|
|
48
|
+
default=None, description="Location where Chroma is persisted, if not connecting via http."
|
|
49
|
+
)
|
|
50
|
+
tenant: Optional[str] = Field(
|
|
51
|
+
default="default_tenant", description="The tenant to use for this client."
|
|
52
|
+
)
|
|
53
|
+
database: Optional[str] = Field(
|
|
54
|
+
default="default_database", description="The database to use for this client."
|
|
55
|
+
)
|
|
56
|
+
host: Optional[str] = Field(default=None, description="The hostname of the Chroma server.")
|
|
57
|
+
port: Optional[int] = Field(default=None, description="The port of the Chroma server.")
|
|
58
|
+
ssl: bool = Field(
|
|
59
|
+
default=False, description="Whether to use SSL to connect to the Chroma server."
|
|
60
|
+
)
|
|
61
|
+
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
52
62
|
|
|
53
63
|
|
|
54
|
-
@dataclass
|
|
55
64
|
class ChromaUploadStagerConfig(UploadStagerConfig):
|
|
56
65
|
pass
|
|
57
66
|
|
|
@@ -101,9 +110,8 @@ class ChromaUploadStager(UploadStager):
|
|
|
101
110
|
return output_path
|
|
102
111
|
|
|
103
112
|
|
|
104
|
-
@dataclass
|
|
105
113
|
class ChromaUploaderConfig(UploaderConfig):
|
|
106
|
-
batch_size: int = 100
|
|
114
|
+
batch_size: int = Field(default=100, description="Number of records per batch")
|
|
107
115
|
|
|
108
116
|
|
|
109
117
|
@dataclass
|
|
@@ -123,10 +131,11 @@ class ChromaUploader(Uploader):
|
|
|
123
131
|
def create_client(self) -> "Client":
|
|
124
132
|
import chromadb
|
|
125
133
|
|
|
134
|
+
access_config = self.connection_config.access_config.get_secret_value()
|
|
126
135
|
if self.connection_config.path:
|
|
127
136
|
return chromadb.PersistentClient(
|
|
128
137
|
path=self.connection_config.path,
|
|
129
|
-
settings=
|
|
138
|
+
settings=access_config.settings,
|
|
130
139
|
tenant=self.connection_config.tenant,
|
|
131
140
|
database=self.connection_config.database,
|
|
132
141
|
)
|
|
@@ -136,8 +145,8 @@ class ChromaUploader(Uploader):
|
|
|
136
145
|
host=self.connection_config.host,
|
|
137
146
|
port=self.connection_config.port,
|
|
138
147
|
ssl=self.connection_config.ssl,
|
|
139
|
-
headers=
|
|
140
|
-
settings=
|
|
148
|
+
headers=access_config.headers,
|
|
149
|
+
settings=access_config.settings,
|
|
141
150
|
tenant=self.connection_config.tenant,
|
|
142
151
|
database=self.connection_config.database,
|
|
143
152
|
)
|