remdb 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +801 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.7.dist-info/METADATA +1473 -0
- remdb-0.3.7.dist-info/RECORD +187 -0
- remdb-0.3.7.dist-info/WHEEL +4 -0
- remdb-0.3.7.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified file system interface abstracting S3 and local storage.
|
|
3
|
+
|
|
4
|
+
Design principles:
|
|
5
|
+
- No upload/download methods - use copy(from, to) instead
|
|
6
|
+
- No zip/unzip - use archive formats in copy operations
|
|
7
|
+
- Extension-based format detection
|
|
8
|
+
- Polars for columnar data by default
|
|
9
|
+
- ContentService integration for special formats
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Callable, BinaryIO, Iterator
|
|
14
|
+
import io
|
|
15
|
+
|
|
16
|
+
from rem.services.fs.s3_provider import S3Provider
|
|
17
|
+
from rem.services.fs.local_provider import LocalProvider
|
|
18
|
+
from rem.services.fs.git_provider import GitProvider, is_git
|
|
19
|
+
from rem.settings import settings
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def is_s3(uri: str) -> bool:
    """Return True when *uri* begins with the ``s3://`` scheme prefix.

    The comparison is case-sensitive; anything else (local paths, other
    schemes) yields False.
    """
    s3_prefix = "s3://"
    return uri.startswith(s3_prefix)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def is_archive(uri: str) -> bool:
    """Check if URI names an archive file (.zip, .tar, .gz, or .tar.<compression>).

    Only the extensions of the final path component are inspected, so an
    archive-looking directory name (``export.zip/readme.txt``) or a name that
    merely contains an archive extension as a substring (``notes.targets.txt``)
    is not reported as an archive. Matching is case-insensitive and any
    ``?query`` part of the URI is ignored. A URI ending in ``/`` has no final
    component and is therefore never an archive.

    Args:
        uri: File path or URI (s3://..., https://..., or local path)

    Returns:
        True if the file name ends in .zip, .tar or .gz, or is a compressed
        tarball such as .tar.bz2 / .tar.xz; False otherwise
    """
    # Drop a query string (e.g. presigned URLs) and keep the last path segment.
    name = uri.lower().split("?", 1)[0].rsplit("/", 1)[-1]
    suffixes = Path(name).suffixes
    if not suffixes:
        return False
    if suffixes[-1] in (".zip", ".tar", ".gz"):
        return True
    # Compressed tarballs: the compression suffix follows ".tar".
    return len(suffixes) >= 2 and suffixes[-2] == ".tar"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class FS:
|
|
33
|
+
"""
|
|
34
|
+
Entry point to file systems abstracting S3 and local storage.
|
|
35
|
+
|
|
36
|
+
All operations work seamlessly across S3 and local filesystems.
|
|
37
|
+
Uses Polars for columnar data and ContentService for special formats.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
def __init__(self):
|
|
41
|
+
"""Initialize filesystem with S3, local, and Git providers."""
|
|
42
|
+
self._s3_provider = S3Provider()
|
|
43
|
+
self._local_provider = LocalProvider()
|
|
44
|
+
self._git_provider = GitProvider() if settings.git.enabled else None
|
|
45
|
+
|
|
46
|
+
def https_to_file(self, web_request_uri: str, target_uri: str, token: str | None = None):
|
|
47
|
+
"""
|
|
48
|
+
Download a remote resource to the filesystem.
|
|
49
|
+
|
|
50
|
+
Examples:
|
|
51
|
+
- Download Airtable/Slack files to S3
|
|
52
|
+
- Download public files to local storage
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
web_request_uri: HTTPS URL to download from
|
|
56
|
+
target_uri: Destination (s3://... or local path)
|
|
57
|
+
token: Optional bearer token for authorization
|
|
58
|
+
"""
|
|
59
|
+
import requests
|
|
60
|
+
|
|
61
|
+
headers = {"Authorization": f"Bearer {token}"} if token else None
|
|
62
|
+
response = requests.get(web_request_uri, headers=headers)
|
|
63
|
+
response.raise_for_status()
|
|
64
|
+
|
|
65
|
+
with self.open(target_uri, "wb") as f:
|
|
66
|
+
f.write(response.content)
|
|
67
|
+
|
|
68
|
+
def open(self, uri: str, mode: str = "rb") -> BinaryIO:
|
|
69
|
+
"""
|
|
70
|
+
Open file for read or write.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
uri: File path (s3://... or local path)
|
|
74
|
+
mode: File mode (r, rb, w, wb, etc.)
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
File-like object
|
|
78
|
+
"""
|
|
79
|
+
if is_s3(uri):
|
|
80
|
+
return self._s3_provider.open(uri, mode=mode)
|
|
81
|
+
else:
|
|
82
|
+
return self._local_provider.open(uri, mode=mode)
|
|
83
|
+
|
|
84
|
+
def exists(self, uri: str) -> bool:
|
|
85
|
+
"""
|
|
86
|
+
Check if file or folder exists.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
uri: File or directory path (s3://, git://, or local)
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
True if exists, False otherwise
|
|
93
|
+
"""
|
|
94
|
+
if is_git(uri):
|
|
95
|
+
if not self._git_provider:
|
|
96
|
+
raise ValueError("Git provider not enabled. Set GIT__ENABLED=true")
|
|
97
|
+
return self._git_provider.exists(uri)
|
|
98
|
+
elif is_s3(uri):
|
|
99
|
+
return self._s3_provider.exists(uri)
|
|
100
|
+
else:
|
|
101
|
+
return self._local_provider.exists(uri)
|
|
102
|
+
|
|
103
|
+
def from_parent_dir(self, uri: str, file: str | None = None) -> str:
|
|
104
|
+
"""
|
|
105
|
+
Construct path from parent directory.
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
uri: Current file path
|
|
109
|
+
file: Optional file name to append
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
Parent directory path (optionally with file appended)
|
|
113
|
+
"""
|
|
114
|
+
pth = str(Path(uri).parent)
|
|
115
|
+
if is_s3(uri):
|
|
116
|
+
pth = pth.replace("s3:/", "s3://")
|
|
117
|
+
return pth if not file else f"{pth}/{file}"
|
|
118
|
+
|
|
119
|
+
def read(self, uri: str, use_polars: bool = True, **options) -> Any:
|
|
120
|
+
"""
|
|
121
|
+
Read any data type - extensions determine the reader.
|
|
122
|
+
|
|
123
|
+
Supports:
|
|
124
|
+
- Columnar: .csv, .parquet, .feather, .avro (via Polars/Pandas)
|
|
125
|
+
- Structured: .json, .yaml, .yml
|
|
126
|
+
- Documents: .pdf, .docx, .md, .txt
|
|
127
|
+
- Images: .png, .jpg, .jpeg, .tiff, .svg
|
|
128
|
+
- Binary: .pickle
|
|
129
|
+
- Audio: .wav, .mp3 (TODO)
|
|
130
|
+
- Spreadsheets: .xlsx, .xls
|
|
131
|
+
|
|
132
|
+
Args:
|
|
133
|
+
uri: File path (s3://, git://, or local)
|
|
134
|
+
use_polars: Use Polars for dataframes (default: True)
|
|
135
|
+
**options: Format-specific options
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
Parsed data in appropriate format
|
|
139
|
+
"""
|
|
140
|
+
if is_git(uri):
|
|
141
|
+
if not self._git_provider:
|
|
142
|
+
raise ValueError("Git provider not enabled. Set GIT__ENABLED=true")
|
|
143
|
+
return self._git_provider.read(uri, **options)
|
|
144
|
+
elif is_s3(uri):
|
|
145
|
+
return self._s3_provider.read(uri, use_polars=use_polars, **options)
|
|
146
|
+
else:
|
|
147
|
+
return self._local_provider.read(uri, use_polars=use_polars, **options)
|
|
148
|
+
|
|
149
|
+
def write(self, uri: str, data: Any, **options):
|
|
150
|
+
"""
|
|
151
|
+
Write any data type - extensions determine the writer.
|
|
152
|
+
|
|
153
|
+
Args:
|
|
154
|
+
uri: File path
|
|
155
|
+
data: Data to write
|
|
156
|
+
**options: Format-specific options
|
|
157
|
+
"""
|
|
158
|
+
if is_s3(uri):
|
|
159
|
+
return self._s3_provider.write(uri, data, **options)
|
|
160
|
+
else:
|
|
161
|
+
return self._local_provider.write(uri, data, **options)
|
|
162
|
+
|
|
163
|
+
def copy(self, uri_from: str, uri_to: str):
|
|
164
|
+
"""
|
|
165
|
+
Copy files between filesystems.
|
|
166
|
+
|
|
167
|
+
Supports:
|
|
168
|
+
- s3 -> s3
|
|
169
|
+
- local -> s3 (upload)
|
|
170
|
+
- s3 -> local (download)
|
|
171
|
+
- local -> local
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
uri_from: Source path
|
|
175
|
+
uri_to: Destination path
|
|
176
|
+
"""
|
|
177
|
+
from_s3 = is_s3(uri_from)
|
|
178
|
+
to_s3 = is_s3(uri_to)
|
|
179
|
+
|
|
180
|
+
if from_s3 and to_s3:
|
|
181
|
+
# S3 to S3
|
|
182
|
+
return self._s3_provider.copy(uri_from, uri_to)
|
|
183
|
+
elif from_s3 and not to_s3:
|
|
184
|
+
# S3 to local (download)
|
|
185
|
+
return self._s3_provider.copy(uri_from, uri_to)
|
|
186
|
+
elif not from_s3 and to_s3:
|
|
187
|
+
# Local to S3 (upload)
|
|
188
|
+
return self._s3_provider.copy(uri_from, uri_to)
|
|
189
|
+
else:
|
|
190
|
+
# Local to local
|
|
191
|
+
return self._local_provider.copy(uri_from, uri_to)
|
|
192
|
+
|
|
193
|
+
def cache_data(self, data: Any, **kwargs) -> str:
|
|
194
|
+
"""
|
|
195
|
+
Cache data to S3 storage.
|
|
196
|
+
|
|
197
|
+
Currently supports images, can be extended for other types.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
data: Data to cache
|
|
201
|
+
**kwargs: Additional options (uri, suffix, etc.)
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
URI of cached data
|
|
205
|
+
"""
|
|
206
|
+
return self._s3_provider.cache_data(data, **kwargs)
|
|
207
|
+
|
|
208
|
+
def ls(self, uri: str, **options) -> list[str]:
|
|
209
|
+
"""
|
|
210
|
+
List files from a prefix recursively.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
uri: Directory path or prefix (s3://, git://, or local)
|
|
214
|
+
**options: Provider-specific options
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
List of file URIs
|
|
218
|
+
"""
|
|
219
|
+
if is_git(uri):
|
|
220
|
+
if not self._git_provider:
|
|
221
|
+
raise ValueError("Git provider not enabled. Set GIT__ENABLED=true")
|
|
222
|
+
return self._git_provider.ls(uri, **options)
|
|
223
|
+
elif is_s3(uri):
|
|
224
|
+
return self._s3_provider.ls(uri, **options)
|
|
225
|
+
else:
|
|
226
|
+
return self._local_provider.ls(uri, **options)
|
|
227
|
+
|
|
228
|
+
def ls_dirs(self, uri: str, **options) -> list[str]:
|
|
229
|
+
"""
|
|
230
|
+
List immediate child directories.
|
|
231
|
+
|
|
232
|
+
Args:
|
|
233
|
+
uri: Directory path or prefix
|
|
234
|
+
**options: Provider-specific options
|
|
235
|
+
|
|
236
|
+
Returns:
|
|
237
|
+
List of directory URIs
|
|
238
|
+
"""
|
|
239
|
+
if is_s3(uri):
|
|
240
|
+
return self._s3_provider.ls_dirs(uri, **options)
|
|
241
|
+
else:
|
|
242
|
+
return self._local_provider.ls_dirs(uri, **options)
|
|
243
|
+
|
|
244
|
+
def ls_iter(self, uri: str, **options) -> Iterator[str]:
|
|
245
|
+
"""
|
|
246
|
+
Iterate over files from a prefix (for pagination).
|
|
247
|
+
|
|
248
|
+
Args:
|
|
249
|
+
uri: Directory path or prefix
|
|
250
|
+
**options: Provider-specific options
|
|
251
|
+
|
|
252
|
+
Yields:
|
|
253
|
+
File URIs
|
|
254
|
+
"""
|
|
255
|
+
if is_s3(uri):
|
|
256
|
+
yield from self._s3_provider.ls_iter(uri, **options)
|
|
257
|
+
else:
|
|
258
|
+
yield from self._local_provider.ls_iter(uri, **options)
|
|
259
|
+
|
|
260
|
+
def delete(self, uri: str, limit: int = 100):
|
|
261
|
+
"""
|
|
262
|
+
Delete objects in a folder/directory.
|
|
263
|
+
|
|
264
|
+
Safety limit prevents accidental bulk deletions.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
uri: File or directory path
|
|
268
|
+
limit: Maximum number of files to delete
|
|
269
|
+
|
|
270
|
+
Returns:
|
|
271
|
+
List of deleted file URIs
|
|
272
|
+
"""
|
|
273
|
+
if is_s3(uri):
|
|
274
|
+
return self._s3_provider.delete(uri, limit=limit)
|
|
275
|
+
else:
|
|
276
|
+
return self._local_provider.delete(uri, limit=limit)
|
|
277
|
+
|
|
278
|
+
def read_dataset(self, uri: str):
|
|
279
|
+
"""
|
|
280
|
+
Read data as PyArrow dataset.
|
|
281
|
+
|
|
282
|
+
Useful for:
|
|
283
|
+
- Lazy loading large datasets
|
|
284
|
+
- Partitioned data
|
|
285
|
+
- S3 Express use cases
|
|
286
|
+
|
|
287
|
+
Args:
|
|
288
|
+
uri: Dataset path (parquet, etc.)
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
PyArrow Dataset
|
|
292
|
+
"""
|
|
293
|
+
if is_s3(uri):
|
|
294
|
+
return self._s3_provider.read_dataset(uri)
|
|
295
|
+
else:
|
|
296
|
+
return self._local_provider.read_dataset(uri)
|
|
297
|
+
|
|
298
|
+
def read_image(self, uri: str):
|
|
299
|
+
"""
|
|
300
|
+
Read image as PIL Image.
|
|
301
|
+
|
|
302
|
+
Args:
|
|
303
|
+
uri: Image file path
|
|
304
|
+
|
|
305
|
+
Returns:
|
|
306
|
+
PIL Image object
|
|
307
|
+
"""
|
|
308
|
+
if is_s3(uri):
|
|
309
|
+
return self._s3_provider.read_image(uri)
|
|
310
|
+
else:
|
|
311
|
+
return self._local_provider.read_image(uri)
|
|
312
|
+
|
|
313
|
+
def apply(self, uri: str, fn: Callable[[str], Any]) -> Any:
|
|
314
|
+
"""
|
|
315
|
+
Apply a function to a file.
|
|
316
|
+
|
|
317
|
+
Downloads file to temporary location if needed, then passes
|
|
318
|
+
local path to the function.
|
|
319
|
+
|
|
320
|
+
Args:
|
|
321
|
+
uri: File path
|
|
322
|
+
fn: Function that takes a local file path
|
|
323
|
+
|
|
324
|
+
Returns:
|
|
325
|
+
Result of function call
|
|
326
|
+
"""
|
|
327
|
+
if is_s3(uri):
|
|
328
|
+
return self._s3_provider.apply(uri, fn)
|
|
329
|
+
else:
|
|
330
|
+
return self._local_provider.apply(uri, fn)
|
|
331
|
+
|
|
332
|
+
def local_file(self, uri: str) -> str:
|
|
333
|
+
"""
|
|
334
|
+
Get local file path, downloading from S3 if needed.
|
|
335
|
+
|
|
336
|
+
Args:
|
|
337
|
+
uri: File path (s3://... or local)
|
|
338
|
+
|
|
339
|
+
Returns:
|
|
340
|
+
Local file path
|
|
341
|
+
"""
|
|
342
|
+
if is_s3(uri):
|
|
343
|
+
return self._s3_provider.local_file(uri)
|
|
344
|
+
else:
|
|
345
|
+
return uri
|
|
346
|
+
|
|
347
|
+
# ========================================================================
|
|
348
|
+
# Parsing Hooks
|
|
349
|
+
# ========================================================================
|
|
350
|
+
|
|
351
|
+
def get_parsed_uri(self, uri: str, resource: str = "content.md") -> str:
|
|
352
|
+
"""
|
|
353
|
+
Get URI for parsed version of a file.
|
|
354
|
+
|
|
355
|
+
Args:
|
|
356
|
+
uri: Original file URI
|
|
357
|
+
resource: Resource within parsed directory (default: content.md)
|
|
358
|
+
|
|
359
|
+
Returns:
|
|
360
|
+
Parsed resource URI
|
|
361
|
+
|
|
362
|
+
Example:
|
|
363
|
+
fs = FS()
|
|
364
|
+
parsed_uri = fs.get_parsed_uri("s3://bucket/file.pdf")
|
|
365
|
+
metadata_uri = fs.get_parsed_uri("s3://bucket/file.pdf", "metadata.json")
|
|
366
|
+
"""
|
|
367
|
+
if is_s3(uri):
|
|
368
|
+
return self._s3_provider.get_parsed_uri(uri, resource)
|
|
369
|
+
else:
|
|
370
|
+
return self._local_provider.get_parsed_uri(uri, resource)
|
|
371
|
+
|
|
372
|
+
def has_parsed(self, uri: str) -> bool:
|
|
373
|
+
"""
|
|
374
|
+
Check if parsed version exists for a file.
|
|
375
|
+
|
|
376
|
+
Args:
|
|
377
|
+
uri: Original file URI
|
|
378
|
+
|
|
379
|
+
Returns:
|
|
380
|
+
True if metadata.json exists in .parsed/ directory
|
|
381
|
+
|
|
382
|
+
Example:
|
|
383
|
+
if fs.has_parsed("s3://bucket/file.pdf"):
|
|
384
|
+
content = fs.read_parsed("s3://bucket/file.pdf")
|
|
385
|
+
"""
|
|
386
|
+
if is_s3(uri):
|
|
387
|
+
return self._s3_provider.has_parsed(uri)
|
|
388
|
+
else:
|
|
389
|
+
return self._local_provider.has_parsed(uri)
|
|
390
|
+
|
|
391
|
+
def read_parsed(self, uri: str, resource: str = "content.md", **options) -> Any:
|
|
392
|
+
"""
|
|
393
|
+
Read parsed content for a file.
|
|
394
|
+
|
|
395
|
+
Args:
|
|
396
|
+
uri: Original file URI
|
|
397
|
+
resource: Resource to read (default: content.md)
|
|
398
|
+
**options: Format-specific read options
|
|
399
|
+
|
|
400
|
+
Returns:
|
|
401
|
+
Parsed content (format depends on resource)
|
|
402
|
+
|
|
403
|
+
Example:
|
|
404
|
+
# Read parsed markdown
|
|
405
|
+
markdown = fs.read_parsed("s3://bucket/file.pdf")
|
|
406
|
+
|
|
407
|
+
# Read parse metadata
|
|
408
|
+
metadata = fs.read_parsed("s3://bucket/file.pdf", "metadata.json")
|
|
409
|
+
"""
|
|
410
|
+
if is_s3(uri):
|
|
411
|
+
return self._s3_provider.read_parsed(uri, resource, **options)
|
|
412
|
+
else:
|
|
413
|
+
return self._local_provider.read_parsed(uri, resource, **options)
|
|
414
|
+
|
|
415
|
+
def write_parsed(
|
|
416
|
+
self,
|
|
417
|
+
uri: str,
|
|
418
|
+
content: Any,
|
|
419
|
+
resource: str = "content.md",
|
|
420
|
+
metadata: dict[str, Any] | None = None,
|
|
421
|
+
):
|
|
422
|
+
"""
|
|
423
|
+
Write parsed content for a file.
|
|
424
|
+
|
|
425
|
+
Args:
|
|
426
|
+
uri: Original file URI
|
|
427
|
+
content: Parsed content to write
|
|
428
|
+
resource: Resource name (default: content.md)
|
|
429
|
+
metadata: Optional parse metadata
|
|
430
|
+
|
|
431
|
+
Example:
|
|
432
|
+
fs.write_parsed(
|
|
433
|
+
"s3://bucket/file.pdf",
|
|
434
|
+
markdown_content,
|
|
435
|
+
metadata={"provider": "kreuzberg", "page_count": 10}
|
|
436
|
+
)
|
|
437
|
+
"""
|
|
438
|
+
if is_s3(uri):
|
|
439
|
+
self._s3_provider.write_parsed(uri, content, resource, metadata)
|
|
440
|
+
else:
|
|
441
|
+
self._local_provider.write_parsed(uri, content, resource, metadata)
|
|
442
|
+
|
|
443
|
+
def list_parsed_resources(self, uri: str) -> list[str]:
|
|
444
|
+
"""
|
|
445
|
+
List all resources in parsed directory.
|
|
446
|
+
|
|
447
|
+
Args:
|
|
448
|
+
uri: Original file URI
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
List of resource paths (relative to .parsed/ directory)
|
|
452
|
+
|
|
453
|
+
Example:
|
|
454
|
+
resources = fs.list_parsed_resources("s3://bucket/file.pdf")
|
|
455
|
+
# ['content.md', 'metadata.json', 'images/page_1.png']
|
|
456
|
+
"""
|
|
457
|
+
if is_s3(uri):
|
|
458
|
+
return self._s3_provider.list_parsed_resources(uri)
|
|
459
|
+
else:
|
|
460
|
+
return self._local_provider.list_parsed_resources(uri)
|