unstructured-ingest 0.0.18__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/astradb.py +2 -2
- unstructured_ingest/connector/astradb.py +54 -24
- unstructured_ingest/v2/processes/chunker.py +8 -29
- unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
- unstructured_ingest/v2/processes/connectors/chroma.py +2 -5
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +12 -14
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -6
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -6
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +2 -5
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -6
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +1 -6
- unstructured_ingest/v2/processes/connectors/kdbai.py +2 -5
- unstructured_ingest/v2/processes/connectors/local.py +2 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -5
- unstructured_ingest/v2/processes/connectors/mongodb.py +2 -5
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -5
- unstructured_ingest/v2/processes/connectors/sql.py +1 -6
- unstructured_ingest/v2/processes/connectors/weaviate.py +2 -5
- unstructured_ingest/v2/processes/partitioner.py +9 -55
- unstructured_ingest/v2/unstructured_api.py +87 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/METADATA +369 -369
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/RECORD +27 -26
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.0.18"  # pragma: no cover
+__version__ = "0.0.21"  # pragma: no cover

unstructured_ingest/cli/cmds/astradb.py

@@ -37,11 +37,11 @@ class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
                 "numbers, and underscores.",
             ),
             click.Option(
-                ["--namespace"],
+                ["--keyspace"],
                 required=False,
                 default=None,
                 type=str,
-                help="The Astra DB connection namespace.",
+                help="The Astra DB connection keyspace.",
             ),
         ]
         return options

unstructured_ingest/connector/astradb.py

@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if t.TYPE_CHECKING:
-    from astrapy
+    from astrapy import Collection as AstraDBCollection
+    from astrapy import Database as AstraDB
 
 NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
 
@@ -39,6 +40,7 @@ class AstraDBAccessConfig(AccessConfig):
 class SimpleAstraDBConfig(BaseConnectorConfig):
     access_config: AstraDBAccessConfig
     collection_name: str
+    keyspace: t.Optional[str] = None
     namespace: t.Optional[str] = None
 
 
@@ -98,22 +100,30 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def astra_db_collection(self) -> "AstraDBCollection":
         if self._astra_db_collection is None:
-            from astrapy
+            from astrapy import DataAPIClient as AstraDBClient
 
-            #
+            # Choose keyspace or deprecated namespace
+            keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
+
+            # Create a client object to interact with the Astra DB
             # caller_name/version for Astra DB tracking
-
-                api_endpoint=self.connector_config.access_config.api_endpoint,
-                token=self.connector_config.access_config.token,
-                namespace=self.connector_config.namespace,
+            my_client = AstraDBClient(
                 caller_name=integration_name,
                 caller_version=integration_version,
             )
 
-            #
-            self.
-
+            # Get the database object
+            self._astra_db = my_client.get_database(
+                api_endpoint=self.connector_config.access_config.api_endpoint,
+                token=self.connector_config.access_config.token,
+                keyspace=keyspace_param,
             )
+
+            # Create and connect to the newly created collection
+            self._astra_db_collection = self._astra_db.get_collection(
+                name=self.connector_config.collection_name,
+            )
+
         return self._astra_db_collection  # type: ignore
 
     @requires_dependencies(["astrapy"], extras="astradb")
@@ -132,8 +142,14 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_ingest_docs(self):  # type: ignore
         # Perform the find operation
-
+        astra_db_docs_cursor = self.astra_db_collection.find({})
 
+        # Iterate over the cursor
+        astra_db_docs = []
+        for result in astra_db_docs_cursor:
+            astra_db_docs.append(result)
+
+        # Create a list of AstraDBIngestDoc objects
         doc_list = []
         for record in astra_db_docs:
             doc = AstraDBIngestDoc(
@@ -182,30 +198,41 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def astra_db_collection(self) -> "AstraDBCollection":
         if self._astra_db_collection is None:
-            from astrapy
+            from astrapy import DataAPIClient as AstraDBClient
+            from astrapy.exceptions import CollectionAlreadyExistsException
+
+            # Choose keyspace or deprecated namespace
+            keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
 
             collection_name = self.connector_config.collection_name
             embedding_dimension = self.write_config.embedding_dimension
-
-            # If the user has requested an indexing policy, pass it to the Astra DB
             requested_indexing_policy = self.write_config.requested_indexing_policy
-            options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
 
+            # Create a client object to interact with the Astra DB
             # caller_name/version for Astra DB tracking
-
-                api_endpoint=self.connector_config.access_config.api_endpoint,
-                token=self.connector_config.access_config.token,
-                namespace=self.connector_config.namespace,
+            my_client = AstraDBClient(
                 caller_name=integration_name,
                 caller_version=integration_version,
             )
 
-            #
-            self.
-
-
-
+            # Get the database object
+            self._astra_db = my_client.get_database(
+                api_endpoint=self.connector_config.access_config.api_endpoint,
+                token=self.connector_config.access_config.token,
+                keyspace=keyspace_param,
             )
+
+            # Create and connect to the newly created collection
+            try:
+                self._astra_db_collection = self._astra_db.create_collection(
+                    name=collection_name,
+                    dimension=embedding_dimension,
+                    indexing=requested_indexing_policy,
+                )
+            except CollectionAlreadyExistsException as e:
+                logger.info(f"{e}", exc_info=True)
+                self._astra_db_collection = self._astra_db.get_collection(name=collection_name)
+
         return self._astra_db_collection
 
     @requires_dependencies(["astrapy"], extras="astradb")
@@ -224,6 +251,9 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(f"inserting / updating {len(elements_dict)} documents to Astra DB.")
 
+        if self._astra_db_collection is None:
+            raise DestinationConnectionError("Astra DB collection not available for insertion.")
+
         astra_db_batch_size = self.write_config.batch_size
 
         for batch in batch_generator(elements_dict, astra_db_batch_size):

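Taken together, these hunks move the connector from the older astrapy usage to the 1.x client/database/collection flow. A condensed sketch of that flow, built only from the calls the new code makes (the endpoint, token, keyspace, and collection name below are placeholder values):

    from astrapy import DataAPIClient

    # caller_name/caller_version mirror the tracking values the connector passes in
    client = DataAPIClient(caller_name="unstructured", caller_version="0.0.21")
    database = client.get_database(
        api_endpoint="https://<db-id>-<region>.apps.astra.datastax.com",  # placeholder
        token="AstraCS:...",  # placeholder
        keyspace="default_keyspace",  # resolved from keyspace or the deprecated namespace
    )
    collection = database.get_collection(name="my_collection")
    docs = [doc for doc in collection.find({})]  # iterate the cursor, as get_ingest_docs() now does
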
unstructured_ingest/v2/processes/chunker.py

@@ -1,5 +1,5 @@
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional
 
@@ -9,6 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.unstructured_api import call_api
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -111,35 +112,13 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-
-        from unstructured_client.models.operations import PartitionRequest
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key.get_secret_value(),
+        elements = await call_api(
             server_url=self.config.chunking_endpoint,
+            api_key=self.config.chunk_api_key.get_secret_value(),
+            filename=elements_filepath,
+            api_parameters=self.config.to_chunking_kwargs(),
         )
-
-        possible_fields = [f.name for f in fields(PartitionParameters)]
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        with open(elements_filepath, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(elements_filepath.resolve()),
-            )
-        filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        partition_request_obj = PartitionRequest(partition_params)
-        resp = client.general.partition(partition_request_obj)
-        elements = resp.elements or []
+
         elements = assign_and_map_hash_ids(elements=elements)
+
         return elements

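The chunker above and the partitioner further down now both delegate the HTTP round trip to a shared call_api helper in the new unstructured_ingest/v2/unstructured_api.py module, which this release adds (+87 lines) but which is not rendered on this page. A hedged sketch of what a helper with that call signature could look like, reassembled from the plumbing removed above; the real module may differ, e.g. it likely keeps the field-filtering logic omitted here for brevity:

    import asyncio
    from pathlib import Path


    async def call_api(
        server_url: str, api_key: str, filename: Path, api_parameters: dict
    ) -> list[dict]:
        from unstructured_client import UnstructuredClient
        from unstructured_client.models.operations import PartitionRequest
        from unstructured_client.models.shared import Files, PartitionParameters

        client = UnstructuredClient(api_key_auth=api_key, server_url=server_url)
        with open(filename, "rb") as f:
            api_parameters["files"] = Files(content=f.read(), file_name=str(filename.resolve()))
        request = PartitionRequest(PartitionParameters(**api_parameters))
        # the client is synchronous, so isolate the IO-heavy call in an executor,
        # as the removed Partitioner.call_api method did
        loop = asyncio.get_event_loop()
        resp = await loop.run_in_executor(None, client.general.partition, request)
        return resp.elements or []
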
unstructured_ingest/v2/processes/connectors/astradb.py

@@ -25,7 +25,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 
 if TYPE_CHECKING:
-    from astrapy
+    from astrapy import Collection as AstraDBCollection
+
 
 CONNECTOR_TYPE = "astradb"
 
@@ -85,7 +86,12 @@ class AstraDBUploaderConfig(UploaderConfig):
     embedding_dimension: int = Field(
         default=384, description="The dimensionality of the embeddings"
     )
-
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The Astra DB connection namespace.",
+        deprecated="Please use 'keyspace' instead.",
+    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
@@ -109,33 +115,34 @@ class AstraDBUploader(Uploader):
 
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
-        from astrapy
+        from astrapy import DataAPIClient as AstraDBClient
 
-        #
-
-        embedding_dimension = self.upload_config.embedding_dimension
-        requested_indexing_policy = self.upload_config.requested_indexing_policy
+        # Choose keyspace or deprecated namespace
+        keyspace_param = self.upload_config.keyspace or self.upload_config.namespace
 
-        #
-
+        # Get the collection_name
+        collection_name = self.upload_config.collection_name
 
         # Build the Astra DB object.
-        # caller_name/version for AstraDB tracking
         access_configs = self.connection_config.access_config.get_secret_value()
-
-
-
-
+
+        # Create a client object to interact with the Astra DB
+        # caller_name/version for Astra DB tracking
+        my_client = AstraDBClient(
            caller_name=integration_name,
            caller_version=integration_version,
        )
 
-        #
-
-
-
-
+        # Get the database object
+        astra_db = my_client.get_database(
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
+            keyspace=keyspace_param,
        )
+
+        # Connect to the newly created collection
+        astra_db_collection = astra_db.get_collection(name=collection_name)
+
         return astra_db_collection
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

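Note how the uploader config keeps namespace alive as a deprecated alias while get_collection resolves keyspace first. A minimal illustration of that fallback, assuming pydantic v2.7+ (where Field(deprecated=...) is supported):

    from typing import Optional

    from pydantic import BaseModel, Field


    class UploaderConfigDemo(BaseModel):  # trimmed to the two fields in question
        keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
        namespace: Optional[str] = Field(
            default=None,
            description="The Astra DB connection namespace.",
            deprecated="Please use 'keyspace' instead.",
        )


    config = UploaderConfigDemo(namespace="default_keyspace")
    # reading .namespace emits a DeprecationWarning, but its value still wins
    # whenever keyspace is unset:
    keyspace_param = config.keyspace or config.namespace  # -> "default_keyspace"
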
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -41,13 +41,10 @@ class ChromaAccessConfig(AccessConfig):
     )
 
 
-SecretChromaAccessConfig = Secret[ChromaAccessConfig]
-
-
 class ChromaConnectionConfig(ConnectionConfig):
     collection_name: str = Field(description="The name of the Chroma collection to write into.")
-    access_config: SecretChromaAccessConfig = Field(
-        default=
+    access_config: Secret[ChromaAccessConfig] = Field(
+        default=ChromaAccessConfig(), validate_default=True
     )
     path: Optional[str] = Field(
         default=None, description="Location where Chroma is persisted, if not connecting via http."

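This is the first of a series of identical cleanups: the module-level Secret* aliases are dropped in favor of inline Secret[...] annotations, and the default becomes a plain access-config instance that validate_default=True tells pydantic to validate, and thereby wrap, at model creation. A minimal sketch of the pattern, assuming pydantic v2.7+ where the generic Secret type exists:

    from typing import Optional

    from pydantic import BaseModel, Field, Secret


    class DemoAccessConfig(BaseModel):  # stand-in for the connector AccessConfig classes
        api_key: Optional[str] = None


    class DemoConnectionConfig(BaseModel):
        # the bare default is coerced into Secret[DemoAccessConfig] because
        # validate_default=True forces validation of default values
        access_config: Secret[DemoAccessConfig] = Field(
            default=DemoAccessConfig(), validate_default=True
        )


    cfg = DemoConnectionConfig()
    print(cfg.access_config)                     # **********
    print(cfg.access_config.get_secret_value())  # the wrapped DemoAccessConfig
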
unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -42,8 +42,10 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         description="The Databricks password part of basic authentication. "
         "Only possible when Host is *.cloud.databricks.com (AWS).",
     )
-    client_id: Optional[str] = Field(default=None)
-    client_secret: Optional[str] = Field(default=None)
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
     token: Optional[str] = Field(
         default=None,
         description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
@@ -81,14 +83,9 @@ class DatabricksVolumesAccessConfig(AccessConfig):
     google_service_account: Optional[str] = None
 
 
-SecretDatabricksVolumesAccessConfig = Secret[DatabricksVolumesAccessConfig]
-
-
 class DatabricksVolumesConnectionConfig(ConnectionConfig):
-    access_config: SecretDatabricksVolumesAccessConfig = Field(
-        default_factory=lambda: SecretDatabricksVolumesAccessConfig(
-            secret_value=DatabricksVolumesAccessConfig()
-        )
+    access_config: Secret[DatabricksVolumesAccessConfig] = Field(
+        default=DatabricksVolumesAccessConfig(), validate_default=True
     )
     host: Optional[str] = Field(
         default=None,
@@ -145,11 +142,12 @@ class DatabricksVolumesUploader(Uploader):
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         output_path = os.path.join(self.upload_config.path, path.name)
-
-
-
-
-
+        with open(path, "rb") as elements_file:
+            self.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
 
 
 databricks_volumes_destination_entry = DestinationRegistryEntry(

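The uploader's run() now streams the local file through the Databricks SDK Files API. An equivalent standalone call, assuming databricks-sdk is installed and authentication is resolved from the environment (the connector builds its client from the connection config instead; the paths are placeholders):

    from databricks.sdk import WorkspaceClient

    w = WorkspaceClient()  # stand-in for self.get_client() in the uploader
    with open("elements.json", "rb") as elements_file:  # placeholder local file
        w.files.upload(
            file_path="/Volumes/my_catalog/my_schema/my_volume/elements.json",  # placeholder
            contents=elements_file,
            overwrite=True,
        )
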
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -76,14 +76,9 @@ class AzureAccessConfig(FsspecAccessConfig):
         raise ValueError("either connection_string or account_name must be set")
 
 
-SecretAzureAccessConfig = Secret[AzureAccessConfig]
-
-
 class AzureConnectionConfig(FsspecConnectionConfig):
     supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
-    access_config: SecretAzureAccessConfig = Field(
-        default_factory=lambda: SecretAzureAccessConfig(secret_value=AzureAccessConfig())
-    )
+    access_config: Secret[AzureAccessConfig]
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def get_access_config(self) -> dict[str, Any]:

unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -36,14 +36,9 @@ class BoxAccessConfig(FsspecAccessConfig):
     )
 
 
-SecretBoxAccessConfig = Secret[BoxAccessConfig]
-
-
 class BoxConnectionConfig(FsspecConnectionConfig):
     supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
-    access_config: SecretBoxAccessConfig = Field(
-        default_factory=lambda: SecretBoxAccessConfig(secret_value=BoxAccessConfig())
-    )
+    access_config: Secret[BoxAccessConfig] = Field(default=BoxAccessConfig(), validate_default=True)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def get_access_config(self) -> dict[str, Any]:

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -35,13 +35,10 @@ class DropboxAccessConfig(FsspecAccessConfig):
     token: Optional[str] = Field(default=None, description="Dropbox access token.")
 
 
-SecretDropboxAccessConfig = Secret[DropboxAccessConfig]
-
-
 class DropboxConnectionConfig(FsspecConnectionConfig):
     supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
-    access_config: SecretDropboxAccessConfig = Field(
-
+    access_config: Secret[DropboxAccessConfig] = Field(
+        default=DropboxAccessConfig(), validate_default=True
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 

unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -86,14 +86,9 @@ class GcsAccessConfig(FsspecAccessConfig):
         raise ValueError("Invalid auth token value")
 
 
-SecretGcsAccessConfig = Secret[GcsAccessConfig]
-
-
 class GcsConnectionConfig(FsspecConnectionConfig):
     supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"], init=False)
-    access_config: SecretGcsAccessConfig = Field(
-        default_factory=lambda: SecretGcsAccessConfig(secret_value=GcsAccessConfig())
-    )
+    access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 

unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -49,14 +49,9 @@ class S3AccessConfig(FsspecAccessConfig):
     )
 
 
-SecretS3AccessConfig = Secret[S3AccessConfig]
-
-
 class S3ConnectionConfig(FsspecConnectionConfig):
     supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
-    access_config: SecretS3AccessConfig = Field(
-        default_factory=lambda: SecretS3AccessConfig(secret_value=S3AccessConfig())
-    )
+    access_config: Secret[S3AccessConfig] = Field(default=S3AccessConfig(), validate_default=True)
     endpoint_url: Optional[str] = Field(
         default=None,
         description="Use this endpoint_url, if specified. Needed for "

unstructured_ingest/v2/processes/connectors/kdbai.py

@@ -39,12 +39,9 @@ class KdbaiAccessConfig(AccessConfig):
     )
 
 
-SecretKdbaiAccessConfig = Secret[KdbaiAccessConfig]
-
-
 class KdbaiConnectionConfig(ConnectionConfig):
-    access_config: SecretKdbaiAccessConfig = Field(
-        default=
+    access_config: Secret[KdbaiAccessConfig] = Field(
+        default=KdbaiAccessConfig(), validate_default=True
     )
     endpoint: str = Field(
         default="http://localhost:8082", description="Endpoint url where KDBAI is hosted."

unstructured_ingest/v2/processes/connectors/local.py

@@ -34,12 +34,9 @@ class LocalAccessConfig(AccessConfig):
     pass
 
 
-SecretLocalAccessConfig = Secret[LocalAccessConfig]
-
-
 class LocalConnectionConfig(ConnectionConfig):
-    access_config: SecretLocalAccessConfig = Field(
-
+    access_config: Secret[LocalAccessConfig] = Field(
+        default=LocalAccessConfig(), validate_default=True
     )
 
 

unstructured_ingest/v2/processes/connectors/milvus.py

@@ -36,12 +36,9 @@ class MilvusAccessConfig(AccessConfig):
     token: Optional[str] = Field(default=None, description="Milvus access token")
 
 
-SecretMilvusAccessConfig = Secret[MilvusAccessConfig]
-
-
 class MilvusConnectionConfig(ConnectionConfig):
-    access_config: SecretMilvusAccessConfig = Field(
-
+    access_config: Secret[MilvusAccessConfig] = Field(
+        default=MilvusAccessConfig(), validate_default=True
     )
     uri: Optional[str] = Field(
         default=None, description="Milvus uri", examples=["http://localhost:19530"]

unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -34,12 +34,9 @@ class MongoDBAccessConfig(AccessConfig):
     uri: Optional[str] = Field(default=None, description="URI to user when connecting")
 
 
-SecretMongoDBAccessConfig = Secret[MongoDBAccessConfig]
-
-
 class MongoDBConnectionConfig(ConnectionConfig):
-    access_config: SecretMongoDBAccessConfig = Field(
-
+    access_config: Secret[MongoDBAccessConfig] = Field(
+        default=MongoDBAccessConfig(), validate_default=True
     )
     host: Optional[str] = Field(
         default=None,

unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -36,13 +36,10 @@ class PineconeAccessConfig(AccessConfig):
     )
 
 
-SecretPineconeAccessConfig = Secret[PineconeAccessConfig]
-
-
 class PineconeConnectionConfig(ConnectionConfig):
     index_name: str = Field(description="Name of the index to connect to.")
-    access_config: SecretPineconeAccessConfig = Field(
-
+    access_config: Secret[PineconeAccessConfig] = Field(
+        default=PineconeAccessConfig(), validate_default=True
     )
 
     @requires_dependencies(["pinecone"], extras="pinecone")

unstructured_ingest/v2/processes/connectors/sql.py

@@ -40,9 +40,6 @@ class SQLAccessConfig(AccessConfig):
     password: Optional[str] = Field(default=None, description="DB password")
 
 
-SecreteSQLAccessConfig = Secret[SQLAccessConfig]
-
-
 class SQLConnectionConfig(ConnectionConfig):
     db_type: Literal["sqlite", "postgresql"] = Field(
         default=SQLITE_DB, description="Type of the database backend"
@@ -53,9 +50,7 @@ class SQLConnectionConfig(ConnectionConfig):
     )
     host: Optional[str] = Field(default=None, description="DB host")
     port: Optional[int] = Field(default=5432, description="DB host connection port")
-    access_config: SecreteSQLAccessConfig = Field(
-        default_factory=lambda: SecreteSQLAccessConfig(secret_value=SQLAccessConfig())
-    )
+    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def __post_init__(self):

unstructured_ingest/v2/processes/connectors/weaviate.py

@@ -38,16 +38,13 @@ class WeaviateAccessConfig(AccessConfig):
     password: Optional[str] = None
 
 
-SecretWeaviateAccessConfig = Secret[WeaviateAccessConfig]
-
-
 class WeaviateConnectionConfig(ConnectionConfig):
     host_url: str = Field(description="Weaviate instance url")
     class_name: str = Field(
         description="Name of the class to push the records into, e.g: Pdf-elements"
     )
-    access_config: SecretWeaviateAccessConfig = Field(
-
+    access_config: Secret[WeaviateAccessConfig] = Field(
+        default=WeaviateAccessConfig(), validate_default=True
     )
     username: Optional[str] = None
     anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")

unstructured_ingest/v2/processes/partitioner.py

@@ -1,8 +1,7 @@
-import asyncio
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -10,11 +9,7 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-
-if TYPE_CHECKING:
-    from unstructured_client import UnstructuredClient
-    from unstructured_client.models.operations import PartitionRequest
-    from unstructured_client.models.shared import PartitionParameters
+from unstructured_ingest.v2.unstructured_api import call_api
 
 
 class PartitionerConfig(BaseModel):
@@ -154,60 +149,19 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))
 
-    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
-        # TODO when client supports async, run without using run_in_executor
-        # isolate the IO heavy call
-        loop = asyncio.get_event_loop()
-        return await loop.run_in_executor(None, client.general.partition, request)
-
-    def create_partition_parameters(self, filename: Path) -> "PartitionParameters":
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        partition_request = self.config.to_partition_kwargs()
-
-        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
-        # Prior to this it was a dataclass which doesn't have .__fields
-        try:
-            possible_fields = PartitionParameters.__fields__
-        except AttributeError:
-            possible_fields = [f.name for f in fields(PartitionParameters)]
-
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
-        with open(filename, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(filename.resolve()),
-            )
-        filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        return partition_params
-
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def partition_via_api(
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-        from unstructured_client.models.operations import PartitionRequest
-
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
-
+
+        elements = await call_api(
             server_url=self.config.partition_endpoint,
-
+            api_key=self.config.api_key.get_secret_value(),
+            filename=filename,
+            api_parameters=self.config.to_partition_kwargs(),
        )
-
-        partition_request = PartitionRequest(partition_params)
-        resp = await self.call_api(client=client, request=partition_request)
-        elements = resp.elements or []
+
         # Append the data source metadata the auto partition does for you
         for element in elements:
             element["metadata"]["data_source"] = metadata