unstructured-ingest 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

@@ -1 +1 @@
-__version__ = "0.0.19" # pragma: no cover
+__version__ = "0.0.21" # pragma: no cover
@@ -37,11 +37,11 @@ class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
                 "numbers, and underscores.",
             ),
             click.Option(
-                ["--namespace"],
+                ["--keyspace"],
                 required=False,
                 default=None,
                 type=str,
-                help="The Astra DB connection namespace.",
+                help="The Astra DB connection keyspace.",
             ),
         ]
         return options
@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if t.TYPE_CHECKING:
-    from astrapy.db import AstraDB, AstraDBCollection
+    from astrapy import Collection as AstraDBCollection
+    from astrapy import Database as AstraDB
 
 NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
 
@@ -39,6 +40,7 @@ class AstraDBAccessConfig(AccessConfig):
 class SimpleAstraDBConfig(BaseConnectorConfig):
     access_config: AstraDBAccessConfig
     collection_name: str
+    keyspace: t.Optional[str] = None
     namespace: t.Optional[str] = None
 
 
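The new keyspace field coexists with the now-deprecated namespace. Everywhere below, the effective value is resolved with the same one-line fallback, shown here standalone (config stands in for any of the connector config objects):

# Prefer the new keyspace field; fall back to the deprecated namespace
keyspace_param = config.keyspace or config.namespace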
@@ -98,22 +100,30 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def astra_db_collection(self) -> "AstraDBCollection":
         if self._astra_db_collection is None:
-            from astrapy.db import AstraDB
+            from astrapy import DataAPIClient as AstraDBClient
 
-            # Build the Astra DB object.
+            # Choose keyspace or deprecated namespace
+            keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
+
+            # Create a client object to interact with the Astra DB
             # caller_name/version for Astra DB tracking
-            self._astra_db = AstraDB(
-                api_endpoint=self.connector_config.access_config.api_endpoint,
-                token=self.connector_config.access_config.token,
-                namespace=self.connector_config.namespace,
+            my_client = AstraDBClient(
                 caller_name=integration_name,
                 caller_version=integration_version,
             )
 
-            # Create and connect to the collection
-            self._astra_db_collection = self._astra_db.collection(
-                collection_name=self.connector_config.collection_name,
+            # Get the database object
+            self._astra_db = my_client.get_database(
+                api_endpoint=self.connector_config.access_config.api_endpoint,
+                token=self.connector_config.access_config.token,
+                keyspace=keyspace_param,
             )
+
+            # Create and connect to the newly created collection
+            self._astra_db_collection = self._astra_db.get_collection(
+                name=self.connector_config.collection_name,
+            )
+
         return self._astra_db_collection # type: ignore
 
     @requires_dependencies(["astrapy"], extras="astradb")
@@ -132,8 +142,14 @@ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_ingest_docs(self): # type: ignore
         # Perform the find operation
-        astra_db_docs = list(self.astra_db_collection.paginated_find())
+        astra_db_docs_cursor = self.astra_db_collection.find({})
 
+        # Iterate over the cursor
+        astra_db_docs = []
+        for result in astra_db_docs_cursor:
+            astra_db_docs.append(result)
+
+        # Create a list of AstraDBIngestDoc objects
         doc_list = []
         for record in astra_db_docs:
             doc = AstraDBIngestDoc(
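For reference, a minimal sketch of the astrapy client flow the source connector migrates to above: a DataAPIClient replaces the removed astrapy.db.AstraDB entry point, and paginated_find() gives way to iterating a find({}) cursor. The endpoint, token, and names below are placeholders:

from astrapy import DataAPIClient

client = DataAPIClient(caller_name="my-app", caller_version="0.0.1")
database = client.get_database(
    api_endpoint="https://<db-id>-<region>.apps.astra.datastax.com",
    token="AstraCS:...",
    keyspace="my_keyspace",
)
collection = database.get_collection(name="my_collection")

# find({}) returns a cursor; iterate it to materialize the documents
docs = [doc for doc in collection.find({})]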
@@ -182,30 +198,41 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
     @requires_dependencies(["astrapy"], extras="astradb")
     def astra_db_collection(self) -> "AstraDBCollection":
         if self._astra_db_collection is None:
-            from astrapy.db import AstraDB
+            from astrapy import DataAPIClient as AstraDBClient
+            from astrapy.exceptions import CollectionAlreadyExistsException
+
+            # Choose keyspace or deprecated namespace
+            keyspace_param = self.connector_config.keyspace or self.connector_config.namespace
 
             collection_name = self.connector_config.collection_name
             embedding_dimension = self.write_config.embedding_dimension
-
-            # If the user has requested an indexing policy, pass it to the Astra DB
             requested_indexing_policy = self.write_config.requested_indexing_policy
-            options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
 
+            # Create a client object to interact with the Astra DB
             # caller_name/version for Astra DB tracking
-            self._astra_db = AstraDB(
-                api_endpoint=self.connector_config.access_config.api_endpoint,
-                token=self.connector_config.access_config.token,
-                namespace=self.connector_config.namespace,
+            my_client = AstraDBClient(
                 caller_name=integration_name,
                 caller_version=integration_version,
             )
 
-            # Create and connect to the newly created collection
-            self._astra_db_collection = self._astra_db.create_collection(
-                collection_name=collection_name,
-                dimension=embedding_dimension,
-                options=options,
+            # Get the database object
+            self._astra_db = my_client.get_database(
+                api_endpoint=self.connector_config.access_config.api_endpoint,
+                token=self.connector_config.access_config.token,
+                keyspace=keyspace_param,
             )
+
+            # Create and connect to the newly created collection
+            try:
+                self._astra_db_collection = self._astra_db.create_collection(
+                    name=collection_name,
+                    dimension=embedding_dimension,
+                    indexing=requested_indexing_policy,
+                )
+            except CollectionAlreadyExistsException as e:
+                logger.info(f"{e}", exc_info=True)
+                self._astra_db_collection = self._astra_db.get_collection(name=collection_name)
+
         return self._astra_db_collection
 
     @requires_dependencies(["astrapy"], extras="astradb")
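The destination connector now creates the collection eagerly and treats an existing collection as a recoverable condition rather than an error. A condensed sketch of that pattern, with placeholder names and an illustrative indexing policy:

from astrapy import DataAPIClient
from astrapy.exceptions import CollectionAlreadyExistsException

database = DataAPIClient().get_database(
    api_endpoint="https://<db-id>-<region>.apps.astra.datastax.com",
    token="AstraCS:...",
)
try:
    collection = database.create_collection(
        name="my_collection",
        dimension=384,  # matches the default embedding_dimension
        indexing={"deny": ["content"]},  # illustrative indexing policy
    )
except CollectionAlreadyExistsException:
    # The collection already exists: reuse it instead of failing
    collection = database.get_collection(name="my_collection")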
@@ -224,6 +251,9 @@ class AstraDBDestinationConnector(BaseDestinationConnector):
     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(f"inserting / updating {len(elements_dict)} documents to Astra DB.")
 
+        if self._astra_db_collection is None:
+            raise DestinationConnectionError("Astra DB collection not available for insertion.")
+
         astra_db_batch_size = self.write_config.batch_size
 
         for batch in batch_generator(elements_dict, astra_db_batch_size):
@@ -1,5 +1,5 @@
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional
 
@@ -9,6 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.unstructured_api import call_api
 
 CHUNK_MAX_CHARS_DEFAULT: int = 500
 CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -111,35 +112,13 @@ class Chunker(BaseProcess, ABC):
 
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-        from unstructured_client.models.operations import PartitionRequest
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key.get_secret_value(),
+        elements = await call_api(
             server_url=self.config.chunking_endpoint,
+            api_key=self.config.chunk_api_key.get_secret_value(),
+            filename=elements_filepath,
+            api_parameters=self.config.to_chunking_kwargs(),
         )
-        partition_request = self.config.to_chunking_kwargs()
-        possible_fields = [f.name for f in fields(PartitionParameters)]
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        with open(elements_filepath, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(elements_filepath.resolve()),
-            )
-            filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        partition_request_obj = PartitionRequest(partition_params)
-        resp = client.general.partition(partition_request_obj)
-        elements = resp.elements or []
+
         elements = assign_and_map_hash_ids(elements=elements)
+
         return elements
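With the request-building logic factored out (see the new unstructured_api module at the end of this diff), run_async reduces to a single call into the shared helper. A sketch of the call site; the endpoint, key, and chunking kwargs are illustrative:

import asyncio
from pathlib import Path

from unstructured_ingest.v2.unstructured_api import call_api

async def chunk_remotely() -> list[dict]:
    # Placeholder endpoint/key/kwargs for illustration
    return await call_api(
        server_url="https://api.unstructured.io",
        api_key="my-api-key",
        filename=Path("elements.json"),
        api_parameters={"chunking_strategy": "by_title"},
    )

elements = asyncio.run(chunk_remotely())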
@@ -25,7 +25,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 
 if TYPE_CHECKING:
-    from astrapy.db import AstraDBCollection
+    from astrapy import Collection as AstraDBCollection
+
 
 CONNECTOR_TYPE = "astradb"
 
@@ -85,7 +86,12 @@ class AstraDBUploaderConfig(UploaderConfig):
     embedding_dimension: int = Field(
         default=384, description="The dimensionality of the embeddings"
     )
-    namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
+    keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The Astra DB connection namespace.",
+        deprecated="Please use 'keyspace' instead.",
+    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
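The deprecated= argument on the namespace field leans on pydantic's field-deprecation support (added in pydantic 2.7; the pinned version is an assumption here). Reading a deprecated field then emits a DeprecationWarning with the given message, as in this minimal sketch:

from typing import Optional

from pydantic import BaseModel, Field

class ExampleConfig(BaseModel):
    keyspace: Optional[str] = Field(default=None)
    namespace: Optional[str] = Field(
        default=None, deprecated="Please use 'keyspace' instead."
    )

cfg = ExampleConfig(namespace="old_keyspace")
_ = cfg.namespace  # emits DeprecationWarning: Please use 'keyspace' instead.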
@@ -109,33 +115,34 @@ class AstraDBUploader(Uploader):
 
     @requires_dependencies(["astrapy"], extras="astradb")
     def get_collection(self) -> "AstraDBCollection":
-        from astrapy.db import AstraDB
+        from astrapy import DataAPIClient as AstraDBClient
 
-        # Get the collection_name and embedding dimension
-        collection_name = self.upload_config.collection_name
-        embedding_dimension = self.upload_config.embedding_dimension
-        requested_indexing_policy = self.upload_config.requested_indexing_policy
+        # Choose keyspace or deprecated namespace
+        keyspace_param = self.upload_config.keyspace or self.upload_config.namespace
 
-        # If the user has requested an indexing policy, pass it to the Astra DB
-        options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
+        # Get the collection_name
+        collection_name = self.upload_config.collection_name
 
         # Build the Astra DB object.
-        # caller_name/version for AstraDB tracking
         access_configs = self.connection_config.access_config.get_secret_value()
-        astra_db = AstraDB(
-            api_endpoint=access_configs.api_endpoint,
-            token=access_configs.token,
-            namespace=self.upload_config.namespace,
+
+        # Create a client object to interact with the Astra DB
+        # caller_name/version for Astra DB tracking
+        my_client = AstraDBClient(
            caller_name=integration_name,
            caller_version=integration_version,
        )
 
-        # Create and connect to the newly created collection
-        astra_db_collection = astra_db.create_collection(
-            collection_name=collection_name,
-            dimension=embedding_dimension,
-            options=options,
+        # Get the database object
+        astra_db = my_client.get_database(
+            api_endpoint=access_configs.api_endpoint,
+            token=access_configs.token,
+            keyspace=keyspace_param,
         )
+
+        # Connect to the newly created collection
+        astra_db_collection = astra_db.get_collection(name=collection_name)
+
         return astra_db_collection
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -42,8 +42,10 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         description="The Databricks password part of basic authentication. "
         "Only possible when Host is *.cloud.databricks.com (AWS).",
     )
-    client_id: Optional[str] = Field(default=None)
-    client_secret: Optional[str] = Field(default=None)
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
     token: Optional[str] = Field(
         default=None,
         description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
@@ -140,11 +142,12 @@ class DatabricksVolumesUploader(Uploader):
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         output_path = os.path.join(self.upload_config.path, path.name)
-        self.get_client().files.upload(
-            file_path=output_path,
-            contents=path,
-            overwrite=self.upload_config.overwrite,
-        )
+        with open(path, "rb") as elements_file:
+            self.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
 
 
 databricks_volumes_destination_entry = DestinationRegistryEntry(
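The uploader previously handed a Path object to contents=; the Databricks SDK's files.upload expects binary content, so the file is now opened explicitly and the handle streamed. A minimal standalone sketch, assuming credentials are resolved from the environment and using placeholder paths:

from databricks.sdk import WorkspaceClient

client = WorkspaceClient()  # host/token picked up from env or config profile
with open("output/elements.json", "rb") as f:
    client.files.upload(
        file_path="/Volumes/my_catalog/my_schema/my_volume/elements.json",
        contents=f,
        overwrite=True,
    )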
@@ -1,8 +1,7 @@
-import asyncio
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -10,11 +9,7 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-
-if TYPE_CHECKING:
-    from unstructured_client import UnstructuredClient
-    from unstructured_client.models.operations import PartitionRequest
-    from unstructured_client.models.shared import PartitionParameters
+from unstructured_ingest.v2.unstructured_api import call_api
 
 
 class PartitionerConfig(BaseModel):
@@ -154,60 +149,19 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))
 
-    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
-        # TODO when client supports async, run without using run_in_executor
-        # isolate the IO heavy call
-        loop = asyncio.get_event_loop()
-        return await loop.run_in_executor(None, client.general.partition, request)
-
-    def create_partition_parameters(self, filename: Path) -> "PartitionParameters":
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        partition_request = self.config.to_partition_kwargs()
-
-        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
-        # Prior to this it was a dataclass which doesn't have .__fields
-        try:
-            possible_fields = PartitionParameters.__fields__
-        except AttributeError:
-            possible_fields = [f.name for f in fields(PartitionParameters)]
-
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
-        with open(filename, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(filename.resolve()),
-            )
-            filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        return partition_params
-
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def partition_via_api(
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-        from unstructured_client.models.operations import PartitionRequest
-
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
-        client = UnstructuredClient(
+
+        elements = await call_api(
             server_url=self.config.partition_endpoint,
-            api_key_auth=self.config.api_key.get_secret_value(),
+            api_key=self.config.api_key.get_secret_value(),
+            filename=filename,
+            api_parameters=self.config.to_partition_kwargs(),
         )
-        partition_params = self.create_partition_parameters(filename=filename)
-        partition_request = PartitionRequest(partition_params)
-        resp = await self.call_api(client=client, request=partition_request)
-        elements = resp.elements or []
+
         # Append the data source metadata the auto partition does for you
         for element in elements:
             element["metadata"]["data_source"] = metadata
@@ -0,0 +1,87 @@
+import asyncio
+from dataclasses import fields
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from unstructured_client.models.operations import PartitionRequest
+
+
+def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
+    """Given a filename and a dict of API parameters, return a PartitionRequest for use
+    by unstructured-client. Remove any params that aren't recognized by the SDK.
+
+    Args:
+        filename: Path to the file being partitioned
+        parameters_dict: A mapping of all API params we want to send
+
+    Returns: A PartitionRequest containing the file and all valid params
+    """
+    from unstructured_client.models.operations import PartitionRequest
+    from unstructured_client.models.shared import Files, PartitionParameters
+
+    # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
+    # Prior to this it was a dataclass which doesn't have .__fields
+    try:
+        possible_fields = PartitionParameters.__fields__
+    except AttributeError:
+        possible_fields = [f.name for f in fields(PartitionParameters)]
+
+    filtered_partition_request = {k: v for k, v in parameters_dict.items() if k in possible_fields}
+    if len(filtered_partition_request) != len(parameters_dict):
+        logger.debug(
+            "Following fields were omitted due to not being "
+            "supported by the currently used unstructured client: {}".format(
+                ", ".join([v for v in parameters_dict if v not in filtered_partition_request])
+            )
+        )
+
+    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")
+
+    with open(filename, "rb") as f:
+        files = Files(
+            content=f.read(),
+            file_name=str(filename.resolve()),
+        )
+        filtered_partition_request["files"] = files
+
+    partition_params = PartitionParameters(**filtered_partition_request)
+
+    return PartitionRequest(partition_parameters=partition_params)
+
+
+async def call_api(
+    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+) -> list[dict]:
+    """Call the Unstructured API using unstructured-client.
+
+    Args:
+        server_url: The base URL where the API is hosted
+        api_key: The user's API key (can be empty if this is a self hosted API)
+        filename: Path to the file being partitioned
+        api_parameters: A dict containing the requested API parameters
+
+    Returns: A list of the file's elements, or an empty list if there was an error
+    """
+    from unstructured_client import UnstructuredClient
+
+    client = UnstructuredClient(
+        server_url=server_url,
+        api_key_auth=api_key,
+    )
+    partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
+
+    # TODO when client supports async, run without using run_in_executor
+    # isolate the IO heavy call
+    loop = asyncio.get_event_loop()
+
+    # Note(austin) - The partition calls needs request to be a keyword arg
+    # We have to use partial to do this, we can't pass request=request into run_in_executor
+    partition_call = partial(client.general.partition, request=partition_request)
+
+    res = await loop.run_in_executor(None, partition_call)
+
+    return res.elements or []
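Until unstructured-client offers native async support, call_api wraps the blocking SDK call in run_in_executor. Since run_in_executor forwards only positional arguments and the partition call takes request as a keyword, the argument is bound first with functools.partial. The general pattern, sketched with a stand-in function:

import asyncio
from functools import partial

def blocking_call(*, request: str) -> str:
    # Stand-in for client.general.partition, which takes
    # `request` as a keyword argument
    return f"processed {request}"

async def main() -> None:
    loop = asyncio.get_event_loop()
    # Bind the keyword argument up front; run_in_executor passes
    # only positional args to the callable it is given.
    call = partial(blocking_call, request="my-request")
    result = await loop.run_in_executor(None, call)
    print(result)

asyncio.run(main())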