unstructured-ingest 0.0.19__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest might be problematic.

Files changed (47)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cmds/astradb.py +2 -2
  3. unstructured_ingest/connector/astradb.py +54 -24
  4. unstructured_ingest/embed/bedrock.py +56 -19
  5. unstructured_ingest/embed/huggingface.py +22 -22
  6. unstructured_ingest/embed/interfaces.py +11 -4
  7. unstructured_ingest/embed/mixedbreadai.py +17 -17
  8. unstructured_ingest/embed/octoai.py +7 -7
  9. unstructured_ingest/embed/openai.py +15 -20
  10. unstructured_ingest/embed/vertexai.py +25 -17
  11. unstructured_ingest/embed/voyageai.py +22 -17
  12. unstructured_ingest/v2/cli/base/cmd.py +1 -1
  13. unstructured_ingest/v2/interfaces/connector.py +1 -1
  14. unstructured_ingest/v2/pipeline/pipeline.py +3 -1
  15. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  16. unstructured_ingest/v2/pipeline/steps/download.py +6 -2
  17. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  18. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  19. unstructured_ingest/v2/pipeline/steps/index.py +4 -2
  20. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  21. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  22. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  23. unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
  24. unstructured_ingest/v2/processes/chunker.py +8 -29
  25. unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
  26. unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
  27. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +11 -8
  28. unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
  29. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
  30. unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
  31. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
  32. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
  33. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
  34. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
  35. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
  36. unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
  37. unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
  38. unstructured_ingest/v2/processes/partitioner.py +9 -55
  39. unstructured_ingest/v2/unstructured_api.py +87 -0
  40. unstructured_ingest/v2/utils.py +1 -1
  41. unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
  42. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +46 -45
  43. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
  44. unstructured_ingest-0.0.19.dist-info/METADATA +0 -639
  45. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
  46. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
  47. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0
--- a/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -22,7 +23,6 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
-from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
 
 CONNECTOR_TYPE = "dropbox"
 
@@ -49,6 +49,40 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"].lstrip("/")
+        date_created = None
+        date_modified = None
+        server_modified = file_data.get("server_modified")
+        client_modified = file_data.get("client_modified")
+        if server_modified and client_modified and server_modified > client_modified:
+            date_created = str(client_modified.timestamp())
+            date_modified = str(server_modified.timestamp())
+        elif server_modified and client_modified and server_modified < client_modified:
+            date_created = str(server_modified.timestamp())
+            date_modified = str(client_modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("content_hash")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def __post_init__(self):
         # dropbox expects the path to start with a /
@@ -63,12 +97,6 @@ class DropboxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
-    def sterilize_info(self, path) -> dict:
-        # the fs.info method defined in the dropboxdrivefs library expects a "url"
-        # kwarg rather than "path"; though both refer to the same thing
-        info = self.fs.info(url=path)
-        return sterilize_dict(data=info)
-
 
 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass
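The new DropboxIndexer.get_metadata infers created/modified times by comparing the two timestamps Dropbox reports per file: the earlier one is treated as creation, the later as modification. A minimal sketch of that comparison, using illustrative datetime values in place of a real dropboxdrivefs listing entry:

    # Sketch of the created/modified inference above; values are made up.
    from datetime import datetime, timezone

    file_data = {  # illustrative listing entry; real ones come from dropboxdrivefs
        "client_modified": datetime(2024, 1, 1, tzinfo=timezone.utc),
        "server_modified": datetime(2024, 6, 1, tzinfo=timezone.utc),
    }
    server = file_data.get("server_modified")
    client = file_data.get("client_modified")
    date_created = date_modified = None
    if server and client and server > client:
        date_created, date_modified = str(client.timestamp()), str(server.timestamp())
    elif server and client and server < client:
        date_created, date_modified = str(server.timestamp()), str(client.timestamp())
    print(date_created, date_modified)  # earlier -> created, later -> modified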
--- a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
-from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
@@ -113,18 +110,13 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
 
-    def list_files(self) -> list[str]:
+    def get_file_data(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
             # because they are seen as 0 byte files
-            found = self.fs.ls(self.index_config.path_without_protocol, detail=True)
-            if isinstance(found, list):
-                return [
-                    x.get("name") for x in found if x.get("size") > 0 and x.get("type") == "file"
-                ]
-            else:
-                raise TypeError(f"unhandled response type from ls: {type(found)}")
+            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+
         else:
             # fs.find will recursively walk directories
             # "size" is a common key for all the cloud protocols with fs
@@ -132,84 +124,40 @@ class FsspecIndexer(Indexer):
                 self.index_config.path_without_protocol,
                 detail=True,
             )
-            if isinstance(found, dict):
-                return [
-                    k for k, v in found.items() if v.get("size") > 0 and v.get("type") == "file"
-                ]
-            else:
-                raise TypeError(f"unhandled response type from find: {type(found)}")
-
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
-        date_created = None
-        date_modified = None
-        file_size = None
-        try:
-            created: Optional[Any] = self.fs.created(path)
-            if created:
-                if isinstance(created, datetime):
-                    date_created = str(created.timestamp())
-                else:
-                    date_created = str(created)
-        except NotImplementedError:
-            pass
+            files = found.values()
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        return filtered_files
 
-        try:
-            modified: Optional[Any] = self.fs.modified(path)
-            if modified:
-                if isinstance(modified, datetime):
-                    date_modified = str(modified.timestamp())
-                else:
-                    date_modified = str(modified)
-        except NotImplementedError:
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
-
-        version = self.fs.checksum(path)
-        metadata: dict[str, str] = {}
-        with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
-        record_locator = {
-            "protocol": self.index_config.protocol,
-            "remote_file_path": self.index_config.remote_url,
-        }
-        file_stat = self.fs.stat(path=path)
-        if file_id := file_stat.get("id"):
-            record_locator["file_id"] = file_id
-        if metadata:
-            record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
-            date_created=date_created,
-            date_modified=date_modified,
-            date_processed=str(time()),
-            version=str(version),
-            url=f"{self.index_config.protocol}://{path}",
-            record_locator=record_locator,
-            filesize_bytes=file_size,
-        )
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        raise NotImplementedError()
+
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
 
-    def sterilize_info(self, path) -> dict:
-        info = self.fs.info(path=path)
-        return sterilize_dict(data=info)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.list_files()
-        for file in files:
+        files = self.get_file_data()
+        for file_data in files:
+            file_path = self.get_path(file_data=file_data)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
-            rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+            rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
 
-            additional_metadata = self.sterilize_info(path=file)
-            additional_metadata["original_file_path"] = file
+            additional_metadata = self.sterilize_info(file_data=file_data)
+            additional_metadata["original_file_path"] = file_path
             yield FileData(
-                identifier=str(uuid5(NAMESPACE_DNS, file)),
+                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
-                    filename=Path(file).name,
+                    filename=Path(file_path).name,
                     rel_path=rel_path or None,
-                    fullpath=file,
+                    fullpath=file_path,
                 ),
-                metadata=self.get_metadata(file_data=file_data),
+                metadata=self.get_metadata(file_data=file_data),
                 additional_metadata=additional_metadata,
            )
 
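This refactor turns FsspecIndexer into a template method: get_file_data returns the raw detail=True listing dicts, while get_path and get_metadata are per-connector hooks (the base get_metadata now raises NotImplementedError), so metadata is built from the listing each connector already has instead of extra per-file fs calls. A minimal standalone sketch of the pattern, with a hypothetical FakeIndexer and a hard-coded listing standing in for a real fsspec filesystem:

    from time import time

    class FakeIndexer:
        """Hypothetical stand-in for FsspecIndexer to show the hook structure."""

        def get_file_data(self) -> list[dict]:
            # the real class gets these dicts from fs.ls(..., detail=True) or fs.find(...)
            listing = [
                {"name": "bucket/a.txt", "size": 12, "type": "file"},
                {"name": "bucket/dir", "size": 0, "type": "directory"},  # filtered out
            ]
            return [f for f in listing if f.get("size") > 0 and f.get("type") == "file"]

        def get_path(self, file_data: dict) -> str:
            # connectors whose listings key the path differently override this (S3 uses "Key")
            return file_data["name"]

        def get_metadata(self, file_data: dict) -> dict:
            # each connector builds metadata from the listing dict it already has,
            # instead of issuing fs.created()/fs.modified()/fs.info() calls per file
            return {"date_processed": str(time()), "filesize_bytes": file_data.get("size")}

    indexer = FakeIndexer()
    for entry in indexer.get_file_data():
        print(indexer.get_path(file_data=entry), indexer.get_metadata(file_data=entry))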
--- a/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
@@ -2,13 +2,15 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional, Union
 
+from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -106,6 +108,33 @@ class GcsIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("updated"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("timeCreated"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class GcsDownloaderConfig(FsspecDownloaderConfig):
     pass
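GCS listing entries carry their "updated" and "timeCreated" values as RFC 3339 strings, which the hunk above converts to epoch seconds with dateutil; a quick illustration with a made-up timestamp:

    from dateutil import parser

    # illustrative value; real strings come from the GCS listing's "updated"/"timeCreated"
    print(parser.parse("2024-06-01T12:30:00.000Z").timestamp())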
--- a/unstructured_ingest/v2/processes/connectors/fsspec/s3.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/s3.py
@@ -1,6 +1,5 @@
 import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
@@ -69,7 +68,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
 
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs.update(
-            {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
+            {k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v}
         )
         return access_configs
 
@@ -80,27 +79,25 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
+    def get_path(self, file_data: dict) -> str:
+        return file_data["Key"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["Key"]
         date_created = None
         date_modified = None
-        file_size = None
-        try:
-            modified: Optional[datetime] = self.fs.modified(path)
-            if modified:
-                date_created = str(modified.timestamp())
-                date_modified = str(modified.timestamp())
-        except NotImplementedError:
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
+        modified = file_data.get("LastModified")
+        if modified:
+            date_created = str(modified.timestamp())
+            date_modified = str(modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_size or file_data.get("Size")
 
-        version = None
-        info: dict[str, Any] = self.fs.info(path)
-        if etag := info.get("ETag"):
-            version = str(etag).rstrip('"').lstrip('"')
+        version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
+            metadata = self.fs.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
--- a/unstructured_ingest/v2/processes/connectors/fsspec/sftp.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/sftp.py
@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 from urllib.parse import urlparse
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -96,6 +97,26 @@ class SftpIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = file_data.get("time").timestamp() if "time" in file_data else None
+        date_modified = file_data.get("mtime").timestamp() if "mtime" in file_data else None
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class SftpDownloaderConfig(FsspecDownloaderConfig):
     remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
--- a/unstructured_ingest/v2/processes/connectors/milvus.py
+++ b/unstructured_ingest/v2/processes/connectors/milvus.py
@@ -48,8 +48,8 @@ class MilvusConnectionConfig(ConnectionConfig):
 
     def get_connection_kwargs(self) -> dict[str, Any]:
         access_config = self.access_config.get_secret_value()
-        access_config_dict = access_config.dict()
-        connection_config_dict = self.dict()
+        access_config_dict = access_config.model_dump()
+        connection_config_dict = self.model_dump()
         connection_config_dict.pop("access_config", None)
         connection_config_dict.update(access_config_dict)
         # Drop any that were not set explicitly
--- a/unstructured_ingest/v2/processes/connectors/opensearch.py
+++ b/unstructured_ingest/v2/processes/connectors/opensearch.py
@@ -101,8 +101,8 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
-        client_kwargs = client_input.dict()
+        logger.debug(f"opensearch client inputs mapped to: {client_input.model_dump()}")
+        client_kwargs = client_input.model_dump()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
         client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
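The .dict() to .model_dump() substitutions here, and in the S3, Milvus, and utils hunks, track the Pydantic v2 API, where BaseModel.dict() is deprecated in favor of model_dump(). A minimal sketch with a made-up model:

    from pydantic import BaseModel

    class ExampleConfig(BaseModel):  # hypothetical model for illustration
        host: str = "localhost"
        port: int = 9200

    cfg = ExampleConfig()
    print(cfg.model_dump())  # {'host': 'localhost', 'port': 9200}; cfg.dict() warns in v2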
--- a/unstructured_ingest/v2/processes/partitioner.py
+++ b/unstructured_ingest/v2/processes/partitioner.py
@@ -1,8 +1,7 @@
-import asyncio
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -10,11 +9,7 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-
-if TYPE_CHECKING:
-    from unstructured_client import UnstructuredClient
-    from unstructured_client.models.operations import PartitionRequest
-    from unstructured_client.models.shared import PartitionParameters
+from unstructured_ingest.v2.unstructured_api import call_api
 
 
 class PartitionerConfig(BaseModel):
@@ -154,60 +149,19 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))
 
-    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
-        # TODO when client supports async, run without using run_in_executor
-        # isolate the IO heavy call
-        loop = asyncio.get_event_loop()
-        return await loop.run_in_executor(None, client.general.partition, request)
-
-    def create_partition_parameters(self, filename: Path) -> "PartitionParameters":
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        partition_request = self.config.to_partition_kwargs()
-
-        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
-        # Prior to this it was a dataclass which doesn't have .__fields
-        try:
-            possible_fields = PartitionParameters.__fields__
-        except AttributeError:
-            possible_fields = [f.name for f in fields(PartitionParameters)]
-
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
-        with open(filename, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(filename.resolve()),
-            )
-        filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        return partition_params
-
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def partition_via_api(
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-        from unstructured_client.models.operations import PartitionRequest
-
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
-        client = UnstructuredClient(
+
+        elements = await call_api(
             server_url=self.config.partition_endpoint,
-            api_key_auth=self.config.api_key.get_secret_value(),
+            api_key=self.config.api_key.get_secret_value(),
+            filename=filename,
+            api_parameters=self.config.to_partition_kwargs(),
         )
-        partition_params = self.create_partition_parameters(filename=filename)
-        partition_request = PartitionRequest(partition_params)
-        resp = await self.call_api(client=client, request=partition_request)
-        elements = resp.elements or []
+
         # Append the data source metadata the auto partition does for you
         for element in elements:
             element["metadata"]["data_source"] = metadata
--- /dev/null
+++ b/unstructured_ingest/v2/unstructured_api.py
@@ -0,0 +1,87 @@
+import asyncio
+from dataclasses import fields
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from unstructured_client.models.operations import PartitionRequest
+
+
+def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
+    """Given a filename and a dict of API parameters, return a PartitionRequest for use
+    by unstructured-client. Remove any params that aren't recognized by the SDK.
+
+    Args:
+        filename: Path to the file being partitioned
+        parameters_dict: A mapping of all API params we want to send
+
+    Returns: A PartitionRequest containing the file and all valid params
+    """
+    from unstructured_client.models.operations import PartitionRequest
+    from unstructured_client.models.shared import Files, PartitionParameters
+
+    # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
+    # Prior to this it was a dataclass which doesn't have .__fields
+    try:
+        possible_fields = PartitionParameters.__fields__
+    except AttributeError:
+        possible_fields = [f.name for f in fields(PartitionParameters)]
+
+    filtered_partition_request = {k: v for k, v in parameters_dict.items() if k in possible_fields}
+    if len(filtered_partition_request) != len(parameters_dict):
+        logger.debug(
+            "Following fields were omitted due to not being "
+            "supported by the currently used unstructured client: {}".format(
+                ", ".join([v for v in parameters_dict if v not in filtered_partition_request])
+            )
+        )
+
+    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")
+
+    with open(filename, "rb") as f:
+        files = Files(
+            content=f.read(),
+            file_name=str(filename.resolve()),
+        )
+    filtered_partition_request["files"] = files
+
+    partition_params = PartitionParameters(**filtered_partition_request)
+
+    return PartitionRequest(partition_parameters=partition_params)
+
+
+async def call_api(
+    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+) -> list[dict]:
+    """Call the Unstructured API using unstructured-client.
+
+    Args:
+        server_url: The base URL where the API is hosted
+        api_key: The user's API key (can be empty if this is a self hosted API)
+        filename: Path to the file being partitioned
+        api_parameters: A dict containing the requested API parameters
+
+    Returns: A list of the file's elements, or an empty list if there was an error
+    """
+    from unstructured_client import UnstructuredClient
+
+    client = UnstructuredClient(
+        server_url=server_url,
+        api_key_auth=api_key,
+    )
+    partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
+
+    # TODO when client supports async, run without using run_in_executor
+    # isolate the IO heavy call
+    loop = asyncio.get_event_loop()
+
+    # Note(austin) - The partition calls needs request to be a keyword arg
+    # We have to use partial to do this, we can't pass request=request into run_in_executor
+    partition_call = partial(client.general.partition, request=partition_request)
+
+    res = await loop.run_in_executor(None, partition_call)
+
+    return res.elements or []
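The new module exposes call_api as a coroutine, so callers outside the Partitioner need an event loop to drive it. A hypothetical driver, with the endpoint, key, and file path as placeholders and the call_api signature taken from the diff above:

    import asyncio
    from pathlib import Path

    from unstructured_ingest.v2.unstructured_api import call_api

    async def main() -> None:
        elements = await call_api(
            server_url="https://api.example.com",  # placeholder endpoint
            api_key="my-api-key",                  # placeholder key
            filename=Path("example.pdf"),          # placeholder local file
            api_parameters={"strategy": "fast"},   # any PartitionParameters field
        )
        print(f"received {len(elements)} elements")

    if __name__ == "__main__":
        asyncio.run(main())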
--- a/unstructured_ingest/v2/utils.py
+++ b/unstructured_ingest/v2/utils.py
@@ -19,7 +19,7 @@ def is_secret(value: Any) -> bool:
 
 def serialize_base_model(model: BaseModel) -> dict:
     # To get the full serialized dict regardless of if values are marked as Secret
-    model_dict = model.dict()
+    model_dict = model.model_dump()
     return serialize_base_dict(model_dict=model_dict)
 