unstructured-ingest 0.0.19__py3-none-any.whl → 0.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/astradb.py +2 -2
- unstructured_ingest/connector/astradb.py +54 -24
- unstructured_ingest/embed/bedrock.py +56 -19
- unstructured_ingest/embed/huggingface.py +22 -22
- unstructured_ingest/embed/interfaces.py +11 -4
- unstructured_ingest/embed/mixedbreadai.py +17 -17
- unstructured_ingest/embed/octoai.py +7 -7
- unstructured_ingest/embed/openai.py +15 -20
- unstructured_ingest/embed/vertexai.py +25 -17
- unstructured_ingest/embed/voyageai.py +22 -17
- unstructured_ingest/v2/cli/base/cmd.py +1 -1
- unstructured_ingest/v2/interfaces/connector.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +3 -1
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +6 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +4 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
- unstructured_ingest/v2/processes/chunker.py +8 -29
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +11 -8
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
- unstructured_ingest/v2/processes/partitioner.py +9 -55
- unstructured_ingest/v2/unstructured_api.py +87 -0
- unstructured_ingest/v2/utils.py +1 -1
- unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +46 -45
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
- unstructured_ingest-0.0.19.dist-info/METADATA +0 -639
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
CHANGED

@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -22,7 +23,6 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
-from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
 
 CONNECTOR_TYPE = "dropbox"
 
@@ -49,6 +49,40 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"].lstrip("/")
+        date_created = None
+        date_modified = None
+        server_modified = file_data.get("server_modified")
+        client_modified = file_data.get("client_modified")
+        if server_modified and client_modified and server_modified > client_modified:
+            date_created = str(client_modified.timestamp())
+            date_modified = str(server_modified.timestamp())
+        elif server_modified and client_modified and server_modified < client_modified:
+            date_created = str(server_modified.timestamp())
+            date_modified = str(client_modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("content_hash")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def __post_init__(self):
         # dropbox expects the path to start with a /
@@ -63,12 +97,6 @@ class DropboxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
-    def sterilize_info(self, path) -> dict:
-        # the fs.info method defined in the dropboxdrivefs library expects a "url"
-        # kwarg rather than "path"; though both refer to the same thing
-        info = self.fs.info(url=path)
-        return sterilize_dict(data=info)
-
 
 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass
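The interesting wrinkle in DropboxIndexer.get_metadata is that Dropbox reports two timestamps per entry, and the connector treats the earlier one as the creation time and the later one as the modification time. A standalone sketch of that ordering logic with fabricated datetimes (the real method receives them on the listing dict as shown above):

from datetime import datetime

# Fake listing entry; real values come from dropboxdrivefs via fs.ls/fs.find
entry = {
    "server_modified": datetime(2024, 7, 1, 12, 0),
    "client_modified": datetime(2024, 7, 1, 11, 0),
}
server, client = entry.get("server_modified"), entry.get("client_modified")
date_created = date_modified = None
if server and client and server > client:
    date_created, date_modified = str(client.timestamp()), str(server.timestamp())
elif server and client and server < client:
    date_created, date_modified = str(server.timestamp()), str(client.timestamp())
print(date_created, date_modified)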
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
CHANGED

@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
-from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
@@ -113,18 +110,13 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
 
-    def
+    def get_file_data(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
             # because they are seen as 0 byte files
-
-
-            return [
-                x.get("name") for x in found if x.get("size") > 0 and x.get("type") == "file"
-            ]
-        else:
-            raise TypeError(f"unhandled response type from ls: {type(found)}")
+            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+
         else:
             # fs.find will recursively walk directories
             # "size" is a common key for all the cloud protocols with fs
@@ -132,84 +124,40 @@ class FsspecIndexer(Indexer):
                 self.index_config.path_without_protocol,
                 detail=True,
             )
-
-
-
-
-
-            raise TypeError(f"unhandled response type from find: {type(found)}")
-
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
-        date_created = None
-        date_modified = None
-        file_size = None
-        try:
-            created: Optional[Any] = self.fs.created(path)
-            if created:
-                if isinstance(created, datetime):
-                    date_created = str(created.timestamp())
-                else:
-                    date_created = str(created)
-        except NotImplementedError:
-            pass
+            files = found.values()
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        return filtered_files
 
-
-
-
-
-
-                else:
-                    date_modified = str(modified)
-        except NotImplementedError:
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
-
-        version = self.fs.checksum(path)
-        metadata: dict[str, str] = {}
-        with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
-        record_locator = {
-            "protocol": self.index_config.protocol,
-            "remote_file_path": self.index_config.remote_url,
-        }
-        file_stat = self.fs.stat(path=path)
-        if file_id := file_stat.get("id"):
-            record_locator["file_id"] = file_id
-        if metadata:
-            record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
-            date_created=date_created,
-            date_modified=date_modified,
-            date_processed=str(time()),
-            version=str(version),
-            url=f"{self.index_config.protocol}://{path}",
-            record_locator=record_locator,
-            filesize_bytes=file_size,
-        )
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        raise NotImplementedError()
+
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
 
-    def sterilize_info(self,
-
-        return sterilize_dict(data=info)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.
-        for
+        files = self.get_file_data()
+        for file_data in files:
+            file_path = self.get_path(file_data=file_data)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
-            rel_path =
+            rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
 
-            additional_metadata = self.sterilize_info(
-            additional_metadata["original_file_path"] =
+            additional_metadata = self.sterilize_info(file_data=file_data)
+            additional_metadata["original_file_path"] = file_path
             yield FileData(
-                identifier=str(uuid5(NAMESPACE_DNS,
+                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
-                    filename=Path(
+                    filename=Path(file_path).name,
                     rel_path=rel_path or None,
-                    fullpath=
+                    fullpath=file_path,
                 ),
-                metadata=self.get_metadata(
+                metadata=self.get_metadata(file_data=file_data),
                 additional_metadata=additional_metadata,
             )
 
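The net effect of this fsspec.py refactor: run() no longer lists, filters, and stats files itself; it calls get_file_data() for raw listing dicts and lets each connector override get_path() and get_metadata() to interpret its backend's keys (Dropbox's "name"/"content_hash", S3's "Key"/"ETag", and so on). A minimal self-contained sketch of the pattern, using stand-in classes rather than the package's real ones:

from time import time
from typing import Any

class SketchIndexer:
    # Stand-in for FsspecIndexer: run() only orchestrates; subclasses
    # interpret the raw listing dicts their filesystem returns.
    def get_file_data(self) -> list[dict[str, Any]]:
        raise NotImplementedError()

    def get_path(self, file_data: dict) -> str:
        return file_data["name"]

    def get_metadata(self, file_data: dict) -> dict:
        raise NotImplementedError()

    def run(self):
        for file_data in self.get_file_data():
            yield {
                "path": self.get_path(file_data=file_data),
                "metadata": self.get_metadata(file_data=file_data),
            }

class SketchS3Indexer(SketchIndexer):
    # S3-style listings key the path under "Key" and expose "Size"/"ETag"
    def get_file_data(self) -> list[dict[str, Any]]:
        return [{"Key": "bucket/doc.pdf", "Size": 123, "ETag": '"abc"'}]

    def get_path(self, file_data: dict) -> str:
        return file_data["Key"]

    def get_metadata(self, file_data: dict) -> dict:
        return {
            "filesize_bytes": file_data.get("Size"),
            "version": file_data["ETag"].strip('"') if "ETag" in file_data else None,
            "date_processed": str(time()),
        }

print(list(SketchS3Indexer().run()))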
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
CHANGED

@@ -2,13 +2,15 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional, Union
 
+from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -106,6 +108,33 @@ class GcsIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("updated"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("timeCreated"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class GcsDownloaderConfig(FsspecDownloaderConfig):
     pass
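Unlike S3 and SFTP, whose listing entries carry datetime objects, GCS reports "updated" and "timeCreated" as RFC 3339 strings, hence the new dateutil import. A quick illustration of the conversion (assumes python-dateutil is installed; the timestamp string is a made-up example of the form GCS returns):

from dateutil import parser

# String form GCS uses for "updated"/"timeCreated" in listing entries
print(parser.parse("2024-07-01T12:00:00.000Z").timestamp())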
unstructured_ingest/v2/processes/connectors/fsspec/s3.py
CHANGED

@@ -1,6 +1,5 @@
 import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
@@ -69,7 +68,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
 
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs.update(
-            {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
+            {k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v}
         )
         return access_configs
 
@@ -80,27 +79,25 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
+    def get_path(self, file_data: dict) -> str:
+        return file_data["Key"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["Key"]
         date_created = None
         date_modified = None
-
-
-
-
-
-
-
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
+        modified = file_data.get("LastModified")
+        if modified:
+            date_created = str(modified.timestamp())
+            date_modified = str(modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_size or file_data.get("Size")
 
-        version = None
-        info: dict[str, Any] = self.fs.info(path)
-        if etag := info.get("ETag"):
-            version = str(etag).rstrip('"').lstrip('"')
+        version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
+            metadata = self.fs.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py
CHANGED

@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 from urllib.parse import urlparse
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -96,6 +97,26 @@ class SftpIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = file_data.get("time").timestamp() if "time" in file_data else None
+        date_modified = file_data.get("mtime").timestamp() if "mtime" in file_data else None
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class SftpDownloaderConfig(FsspecDownloaderConfig):
     remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
unstructured_ingest/v2/processes/connectors/milvus.py
CHANGED

@@ -48,8 +48,8 @@ class MilvusConnectionConfig(ConnectionConfig):
 
     def get_connection_kwargs(self) -> dict[str, Any]:
         access_config = self.access_config.get_secret_value()
-        access_config_dict = access_config.dict()
-        connection_config_dict = self.dict()
+        access_config_dict = access_config.model_dump()
+        connection_config_dict = self.model_dump()
         connection_config_dict.pop("access_config", None)
         connection_config_dict.update(access_config_dict)
         # Drop any that were not set explicitly
unstructured_ingest/v2/processes/connectors/opensearch.py
CHANGED

@@ -101,8 +101,8 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
-        client_kwargs = client_input.dict()
+        logger.debug(f"opensearch client inputs mapped to: {client_input.model_dump()}")
+        client_kwargs = client_input.model_dump()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
         client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
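This pair of one-liners, together with the matching edits in milvus.py above and utils.py at the bottom of the diff, is the Pydantic v2 migration: v2 renames BaseModel.dict() to model_dump() (the old spelling still exists in v2 but emits a deprecation warning). A minimal illustration; ClientInput here is a hypothetical stand-in, not the package's OpenSearchClientInput:

from pydantic import BaseModel

class ClientInput(BaseModel):
    hosts: list[str] = []
    use_ssl: bool = False

client_input = ClientInput(hosts=["https://localhost:9200"])
# Pydantic v2 spelling; on v1 this method does not exist and .dict() is used instead
print(client_input.model_dump())  # {'hosts': ['https://localhost:9200'], 'use_ssl': False}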
unstructured_ingest/v2/processes/partitioner.py
CHANGED

@@ -1,8 +1,7 @@
-import asyncio
 from abc import ABC
-from dataclasses import dataclass, fields
+from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -10,11 +9,7 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
-
-if TYPE_CHECKING:
-    from unstructured_client import UnstructuredClient
-    from unstructured_client.models.operations import PartitionRequest
-    from unstructured_client.models.shared import PartitionParameters
+from unstructured_ingest.v2.unstructured_api import call_api
 
 
 class PartitionerConfig(BaseModel):
@@ -154,60 +149,19 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))
 
-    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
-        # TODO when client supports async, run without using run_in_executor
-        # isolate the IO heavy call
-        loop = asyncio.get_event_loop()
-        return await loop.run_in_executor(None, client.general.partition, request)
-
-    def create_partition_parameters(self, filename: Path) -> "PartitionParameters":
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        partition_request = self.config.to_partition_kwargs()
-
-        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
-        # Prior to this it was a dataclass which doesn't have .__fields
-        try:
-            possible_fields = PartitionParameters.__fields__
-        except AttributeError:
-            possible_fields = [f.name for f in fields(PartitionParameters)]
-
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
-        with open(filename, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(filename.resolve()),
-            )
-        filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        return partition_params
-
     @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def partition_via_api(
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-        from unstructured_client.models.operations import PartitionRequest
-
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
-
+
+        elements = await call_api(
             server_url=self.config.partition_endpoint,
-
+            api_key=self.config.api_key.get_secret_value(),
+            filename=filename,
+            api_parameters=self.config.to_partition_kwargs(),
         )
-
-        partition_request = PartitionRequest(partition_params)
-        resp = await self.call_api(client=client, request=partition_request)
-        elements = resp.elements or []
+
         # Append the data source metadata the auto partition does for you
         for element in elements:
             element["metadata"]["data_source"] = metadata
unstructured_ingest/v2/unstructured_api.py
ADDED

@@ -0,0 +1,87 @@
+import asyncio
+from dataclasses import fields
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from unstructured_client.models.operations import PartitionRequest
+
+
+def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
+    """Given a filename and a dict of API parameters, return a PartitionRequest for use
+    by unstructured-client. Remove any params that aren't recognized by the SDK.
+
+    Args:
+        filename: Path to the file being partitioned
+        parameters_dict: A mapping of all API params we want to send
+
+    Returns: A PartitionRequest containing the file and all valid params
+    """
+    from unstructured_client.models.operations import PartitionRequest
+    from unstructured_client.models.shared import Files, PartitionParameters
+
+    # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
+    # Prior to this it was a dataclass which doesn't have .__fields
+    try:
+        possible_fields = PartitionParameters.__fields__
+    except AttributeError:
+        possible_fields = [f.name for f in fields(PartitionParameters)]
+
+    filtered_partition_request = {k: v for k, v in parameters_dict.items() if k in possible_fields}
+    if len(filtered_partition_request) != len(parameters_dict):
+        logger.debug(
+            "Following fields were omitted due to not being "
+            "supported by the currently used unstructured client: {}".format(
+                ", ".join([v for v in parameters_dict if v not in filtered_partition_request])
+            )
+        )
+
+    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")
+
+    with open(filename, "rb") as f:
+        files = Files(
+            content=f.read(),
+            file_name=str(filename.resolve()),
+        )
+    filtered_partition_request["files"] = files
+
+    partition_params = PartitionParameters(**filtered_partition_request)
+
+    return PartitionRequest(partition_parameters=partition_params)
+
+
+async def call_api(
+    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
+) -> list[dict]:
+    """Call the Unstructured API using unstructured-client.
+
+    Args:
+        server_url: The base URL where the API is hosted
+        api_key: The user's API key (can be empty if this is a self hosted API)
+        filename: Path to the file being partitioned
+        api_parameters: A dict containing the requested API parameters
+
+    Returns: A list of the file's elements, or an empty list if there was an error
+    """
+    from unstructured_client import UnstructuredClient
+
+    client = UnstructuredClient(
+        server_url=server_url,
+        api_key_auth=api_key,
+    )
+    partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)
+
+    # TODO when client supports async, run without using run_in_executor
+    # isolate the IO heavy call
+    loop = asyncio.get_event_loop()
+
+    # Note(austin) - The partition calls needs request to be a keyword arg
+    # We have to use partial to do this, we can't pass request=request into run_in_executor
+    partition_call = partial(client.general.partition, request=partition_request)
+
+    res = await loop.run_in_executor(None, partition_call)
+
+    return res.elements or []
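With the new module in place, partition_via_api reduces to a single await call_api(...). A hypothetical driver showing how the helper is invoked directly (the URL, key, file, and "strategy" parameter are placeholders; a real run needs the unstructured-client extra installed and a reachable API):

import asyncio
from pathlib import Path

from unstructured_ingest.v2.unstructured_api import call_api

async def main() -> None:
    elements = await call_api(
        server_url="https://api.unstructured.io",  # placeholder endpoint
        api_key="<your-api-key>",                  # placeholder key
        filename=Path("example.pdf"),
        api_parameters={"strategy": "fast"},
    )
    print(f"received {len(elements)} elements")

asyncio.run(main())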
unstructured_ingest/v2/utils.py
CHANGED

@@ -19,7 +19,7 @@ def is_secret(value: Any) -> bool:
 
 def serialize_base_model(model: BaseModel) -> dict:
     # To get the full serialized dict regardless of if values are marked as Secret
-    model_dict = model.dict()
+    model_dict = model.model_dump()
     return serialize_base_dict(model_dict=model_dict)
 
 