remdb-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of remdb might be problematic.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.0.dist-info/METADATA +1455 -0
- remdb-0.3.0.dist-info/RECORD +187 -0
- remdb-0.3.0.dist-info/WHEEL +4 -0
- remdb-0.3.0.dist-info/entry_points.txt +2 -0
rem/services/fs/s3_provider.py
@@ -0,0 +1,1042 @@
"""
S3 storage provider for REM file system.

Features:
- Read/write multiple formats (JSON, YAML, CSV, Parquet, images, etc.)
- Presigned URLs for direct access
- Multipart uploads for large files
- Polars integration for columnar data
- Versioning support
- Directory operations (ls, ls_dirs, delete)

Integration:
- Uses rem.settings for S3 configuration
- ContentService for special format parsing (PDF, DOCX, etc.)
- IRSA (IAM Roles for Service Accounts) in EKS

Parsing Hooks:
- Convention: Separate uploads/ and parsed/ directories
- Uploads: s3://bucket/v1/uploads/user/2025/01/19/file.pdf
- Parsed: s3://bucket/v1/parsed/user/2025/01/19/file.pdf/{resource}
- get_parsed_uri(): Get URI for parsed content/metadata/images/tables
- has_parsed(): Check if file has been parsed
- read_parsed(): Read parsed markdown, metadata, or extracted resources
- write_parsed(): Write parsed content with automatic metadata tracking
- list_parsed_resources(): Discover all parsed resources

Example:
    fs = S3Provider()
    upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"

    # Check if already parsed
    if fs.has_parsed(upload_uri):
        markdown = fs.read_parsed(upload_uri)
    else:
        # Parse and cache
        result = parse_file(upload_uri)
        fs.write_parsed(
            upload_uri,
            result.markdown,
            metadata={"provider": "kreuzberg", "page_count": 10}
        )

    # List all parsed resources
    resources = fs.list_parsed_resources(upload_uri)
    # ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']
"""

from pathlib import Path
from typing import Any, BinaryIO, Callable, Iterator
from io import BytesIO
from urllib.parse import urlparse
from datetime import datetime
import json
import tempfile
import io

import boto3
from botocore.exceptions import ClientError
from pydantic import BaseModel, model_validator
from loguru import logger

from rem.settings import settings

# Optional imports for specific formats
try:
    import polars as pl
except ImportError:
    pl = None  # type: ignore[assignment]

try:
    import pandas as pd
except ImportError:
    pd = None  # type: ignore[assignment]

try:
    import yaml
except ImportError:
    yaml = None  # type: ignore[assignment]

try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment]

try:
    import pyarrow as pa
    import pyarrow.dataset as ds
except ImportError:
    pa = None
    ds = None


class S3ObjectListing(BaseModel):
    """S3 object metadata with convenience properties."""

    Key: str
    LastModified: datetime
    Size: int
    bucket: str
    uri: str | None = None

    def __repr__(self):
        return self.uri or f"s3://{self.bucket}/{self.Key}"

    @model_validator(mode="before")
    @classmethod
    def fixup(cls, data: Any) -> Any:
        """Construct full URI from bucket and key."""
        data["uri"] = f"s3://{data['bucket']}/{data['Key']}"
        return data


class FileLikeWritable:
    """
    Wrapper around S3 put_object to provide file-like write interface.

    Used for writing data that doesn't fit in memory or needs streaming.
    """

    def __init__(self, s3_client, bucket: str, key: str):
        self._client = s3_client
        self.bucket = bucket
        self.key = key

    def write(self, data: bytes, **options):
        """Write bytes to S3 object."""
        if isinstance(data, BytesIO):
            data = data.getvalue()
        self._client.put_object(Bucket=self.bucket, Key=self.key, Body=data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        return None


def generate_presigned_url(url: str, expiry: int = 3600, for_upload: bool = False) -> str:
    """
    Generate presigned URL for S3 object access.

    Args:
        url: S3 URI (s3://bucket/key)
        expiry: URL expiration in seconds (default: 3600)
        for_upload: Generate PUT URL instead of GET (default: False)

    Returns:
        Presigned URL for direct S3 access

    Example:
        # Download URL
        url = generate_presigned_url("s3://bucket/file.pdf")

        # Upload URL
        url = generate_presigned_url("s3://bucket/file.pdf", for_upload=True)
    """
    s3 = S3Provider()

    if not s3.is_s3_uri(url):
        return url

    bucket_name, object_key = s3._split_bucket_and_blob_from_path(url)

    try:
        if for_upload:
            return s3._client.generate_presigned_url(
                "put_object",
                Params={"Bucket": bucket_name, "Key": object_key},
                ExpiresIn=expiry,
                HttpMethod="PUT",
            )

        return s3._client.generate_presigned_url(
            "get_object",
            Params={"Bucket": bucket_name, "Key": object_key},
            ExpiresIn=expiry,
        )

    except Exception as ex:
        logger.error(f"Failed to generate presigned URL for {url}: {ex}")
        raise


class S3Provider:
    """
    S3 storage provider with REM settings integration.

    Supports IRSA (IAM Roles for Service Accounts) in EKS for secure access.
    Falls back to access keys for local development or MinIO.

    Parsing Hooks:
    - get_parsed_uri(): Get URI for parsed version of a file
    - read_parsed(): Read parsed content (markdown, images, etc.)
    - write_parsed(): Write parsed content with metadata
    - has_parsed(): Check if parsed version exists

    Convention:
    - Parsed files stored at {original_uri}.parsed/
    - Metadata at {original_uri}.parsed/metadata.json
    - Content at {original_uri}.parsed/content.md (or other formats)
    """

    def __init__(self):
        """Initialize S3 client from REM settings."""
        self._client = self._create_s3_client()

    def _create_s3_client(self):
        """Create S3 client with IRSA or configured credentials."""
        s3_config: dict[str, Any] = {
            "region_name": settings.s3.region,
        }

        # Custom endpoint for MinIO/LocalStack
        if settings.s3.endpoint_url:
            s3_config["endpoint_url"] = settings.s3.endpoint_url

        # Access keys (not needed with IRSA in EKS)
        if settings.s3.access_key_id and settings.s3.secret_access_key:
            s3_config["aws_access_key_id"] = settings.s3.access_key_id
            s3_config["aws_secret_access_key"] = settings.s3.secret_access_key

        # SSL configuration
        s3_config["use_ssl"] = settings.s3.use_ssl

        return boto3.client("s3", **s3_config)

    @staticmethod
    def is_s3_uri(uri: str) -> bool:
        """Check if URI is S3 format."""
        return uri.startswith("s3://")

    def _check_uri(self, uri: str):
        """Validate S3 URI format."""
        url = urlparse(uri)
        if url.scheme != "s3":
            raise ValueError(
                f"URI must be of the form s3://BUCKET/path/to/file "
                f"but got {uri} with scheme {url.scheme}"
            )

    def _split_bucket_and_blob_from_path(self, uri: str) -> tuple[str, str]:
        """
        Split S3 URI into bucket and key.

        Args:
            uri: S3 URI (s3://bucket/path/to/file)

        Returns:
            Tuple of (bucket, key)
        """
        self._check_uri(uri)
        url = urlparse(uri)
        return url.netloc, url.path.lstrip("/")

    def exists(self, uri: str) -> bool:
        """
        Check if S3 object or prefix exists.

        Args:
            uri: S3 URI

        Returns:
            True if exists, False otherwise
        """
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        # For files (has extension), use head_object
        if "." in Path(prefix).name:
            try:
                self._client.head_object(Bucket=bucket, Key=prefix)
                return True
            except ClientError:
                return False

        # For directories/prefixes, use list_objects_v2
        try:
            response = self._client.list_objects_v2(
                Prefix=prefix, Bucket=bucket, MaxKeys=1
            )
            return response.get("KeyCount", 0) > 0
        except ClientError:
            return False

    def open(self, uri: str, mode: str = "rb", version_id: str | None = None) -> BytesIO | FileLikeWritable:
        """
        Open S3 object as file-like object.

        Args:
            uri: S3 URI
            mode: File mode (r, rb, w, wb)
            version_id: Optional S3 version ID for versioned buckets

        Returns:
            File-like object (BytesIO for read, FileLikeWritable for write)
        """
        if mode[0] == "r":
            return BytesIO(self.get_streaming_body(uri, version_id=version_id).read())

        bucket, key = self._split_bucket_and_blob_from_path(uri)
        return FileLikeWritable(self._client, bucket, key)

    def get_streaming_body(
        self,
        uri: str,
        version_id: str | None = None,
        **kwargs,
    ):
        """
        Get streaming body for S3 object.

        Args:
            uri: S3 URI
            version_id: Optional version ID

        Returns:
            S3 streaming body
        """
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        try:
            params = {"Bucket": bucket, "Key": prefix}
            if version_id:
                params["VersionId"] = version_id

            response = self._client.get_object(**params)
            return response["Body"]
        except ClientError as ex:
            logger.error(f"Failed to get S3 object {uri}: {ex}")
            raise

    def read(self, uri: str, use_polars: bool = True, version_id: str | None = None, **options) -> Any:
        """
        Read S3 object with format detection.

        Supports:
        - JSON (.json)
        - YAML (.yml, .yaml)
        - CSV (.csv)
        - Parquet (.parquet)
        - Feather (.feather)
        - Excel (.xlsx, .xls)
        - Text (.txt, .log, .md)
        - Images (.png, .jpg, .jpeg, .tiff, .svg)
        - PDF (.pdf) - TODO: integrate ContentService
        - DOCX (.docx) - TODO: integrate ContentService
        - WAV (.wav) - TODO: add audio provider

        Args:
            uri: S3 URI
            use_polars: Use Polars for dataframes (default: True)
            version_id: Optional S3 version ID
            **options: Format-specific options

        Returns:
            Parsed data in appropriate format
        """
        p = Path(uri)
        suffix = p.suffix.lower()

        # TODO: Integrate ContentService for PDF/DOCX parsing
        if suffix == ".pdf":
            logger.warning("PDF parsing not yet implemented - use ContentService")
            raise NotImplementedError(
                "PDF parsing requires ContentService integration. "
                "TODO: from rem.services.content import ContentService; return ContentService().process_uri(uri)"
            )

        if suffix == ".docx":
            logger.warning("DOCX parsing not yet implemented")
            # TODO: Add python-docx provider
            raise NotImplementedError(
                "DOCX parsing not yet implemented. "
                "TODO: Add python-docx to dependencies and implement DocxProvider"
            )

        # Structured data formats
        if suffix in [".yml", ".yaml"]:
            if not yaml:
                raise ImportError("PyYAML is required for YAML support")
            return yaml.safe_load(self.get_streaming_body(uri, version_id=version_id, **options))

        if suffix == ".json":
            return json.load(self.get_streaming_body(uri, version_id=version_id, **options))

        if suffix == ".txt" or suffix == ".log" or suffix == ".md":
            return self.get_streaming_body(uri, version_id=version_id, **options).read().decode()

        # Columnar data formats
        dataframe_lib = pl if use_polars and pl else pd
        if not dataframe_lib:
            raise ImportError(
                "Either Polars or Pandas is required for tabular data support. "
                "Install with: uv add polars"
            )

        if suffix == ".csv":
            with self.open(uri, "rb") as f:
                return dataframe_lib.read_csv(f, **options)

        if suffix == ".parquet":
            with self.open(uri, "rb") as f:
                return dataframe_lib.read_parquet(f, **options)

        if suffix == ".feather":
            with self.open(uri, "rb") as f:
                # TODO: Verify feather support in Polars
                if use_polars and pl:
                    logger.warning("Feather support in Polars may vary - consider using Pandas")
                return dataframe_lib.read_feather(f, **options)

        if suffix in [".xls", ".xlsx"]:
            # Excel requires pandas
            if not pd:
                raise ImportError("Pandas is required for Excel support")
            # TODO: Add openpyxl or xlrd to dependencies
            logger.warning("Excel support requires openpyxl or xlrd - add to pyproject.toml if needed")
            return pd.read_excel(uri, sheet_name=None, **options)

        # Image formats
        if suffix in [".png", ".jpg", ".jpeg", ".tiff", ".tif"]:
            if not Image:
                raise ImportError("Pillow is required for image support. Install with: uv add pillow")
            with self.open(uri, "rb") as s3f:
                return Image.open(s3f)

        if suffix == ".svg":
            return self.get_streaming_body(uri, version_id=version_id, **options).read().decode()

        # TODO: Audio formats
        if suffix in [".wav", ".mp3", ".flac"]:
            logger.warning(f"Audio format {suffix} not yet supported")
            # TODO: Add librosa or pydub provider
            raise NotImplementedError(
                f"Audio format {suffix} requires audio processing library. "
                "TODO: Add librosa or pydub to dependencies"
            )

        # Binary formats
        if suffix == ".pickle":
            import pickle
            with self.open(uri, "rb") as f:
                return pickle.load(f)

        raise ValueError(
            f"Unsupported file format: {suffix}. "
            f"Supported formats: .json, .yaml, .csv, .parquet, .txt, .png, .jpg, etc."
        )

    def write(self, uri: str, data: Any, **options):
        """
        Write data to S3 with format detection.

        Args:
            uri: S3 URI
            data: Data to write (DataFrame, dict, Image, bytes, str)
            **options: Format-specific options
        """
        p = Path(uri)
        suffix = p.suffix.lower()
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        def write_object(writer_fn):
            """
            Helper to write via BytesIO stream.

            Pattern: write_object(lambda s: data.write_parquet(s))
            - Creates in-memory buffer
            - Calls writer function to populate buffer
            - Uploads buffer contents to S3
            - Avoids writing temporary files to disk
            """
            stream = io.BytesIO()
            writer_fn(stream)
            self._client.put_object(Bucket=bucket, Key=prefix, Body=stream.getvalue())

        # Dataframe formats
        if suffix == ".parquet":
            if hasattr(data, "write_parquet"):  # Polars
                return write_object(lambda s: data.write_parquet(s, **options))
            elif hasattr(data, "to_parquet"):  # Pandas
                return write_object(lambda s: data.to_parquet(s, **options))
            raise TypeError(f"Cannot write {type(data)} to parquet")

        if suffix == ".csv":
            if hasattr(data, "write_csv"):  # Polars
                return write_object(lambda s: data.write_csv(s, **options))
            elif hasattr(data, "to_csv"):  # Pandas
                from functools import partial
                fn = partial(data.to_csv, index=False)
                return write_object(lambda s: fn(s, **options))
            elif isinstance(data, (bytes, str)):
                content = data.encode("utf-8") if isinstance(data, str) else data
                return self._client.put_object(
                    Bucket=bucket, Key=prefix, Body=content, ContentType="text/csv"
                )
            raise TypeError(f"Cannot write {type(data)} to CSV")

        if suffix == ".feather":
            if hasattr(data, "write_feather"):  # Polars (check method name)
                logger.warning("Feather support in Polars - verify method name")
                return write_object(lambda s: data.write_feather(s, **options))
            elif hasattr(data, "to_feather"):  # Pandas
                return write_object(lambda s: data.to_feather(s, **options))
            raise TypeError(f"Cannot write {type(data)} to feather")

        # Structured data formats
        if suffix in [".yml", ".yaml"]:
            if isinstance(data, dict):
                if not yaml:
                    raise ImportError("PyYAML required for YAML support")
                yaml_str = yaml.safe_dump(data)
                return self._client.put_object(Bucket=bucket, Key=prefix, Body=yaml_str.encode('utf-8'))
            raise TypeError(f"YAML requires dict, got {type(data)}")

        if suffix == ".json":
            if isinstance(data, dict):
                json_str = json.dumps(data)
                return self._client.put_object(Bucket=bucket, Key=prefix, Body=json_str.encode('utf-8'))
            raise TypeError(f"JSON requires dict, got {type(data)}")

        # Image formats
        if suffix in [".png", ".jpg", ".jpeg", ".tiff", ".tif"]:
            if not Image:
                raise ImportError("Pillow required for image support")
            if not isinstance(data, Image.Image):
                data = Image.fromarray(data)
            format_name = suffix[1:]  # Remove leading dot
            _data = BytesIO()
            save_options = {"format": format_name, **options}
            if "dpi" in options:
                dpi = options["dpi"]
                save_options["dpi"] = (dpi, dpi) if isinstance(dpi, int) else dpi
            data.save(_data, **save_options)
            return self._client.put_object(Bucket=bucket, Key=prefix, Body=_data.getvalue())

        # Document formats
        if suffix == ".pdf":
            return self._client.put_object(
                Bucket=bucket, Key=prefix, Body=data, ContentType="application/pdf"
            )

        if suffix == ".html":
            return self._client.put_object(
                Bucket=bucket, Key=prefix, Body=data, ContentType="text/html"
            )

        # Binary/text fallback
        if suffix == ".pickle":
            import pickle
            with self.open(uri, "wb") as f:
                return write_object(lambda s: pickle.dump(data, s, **options))

        # Default: write as bytes/string
        return self._client.put_object(Bucket=bucket, Key=prefix, Body=data)

    def copy(self, uri_from: str, uri_to: str):
        """
        Copy files between S3, local, or S3-to-S3.

        Args:
            uri_from: Source URI (s3://... or local path)
            uri_to: Destination URI (s3://... or local path)
        """
        from_s3 = self.is_s3_uri(uri_from)
        to_s3 = self.is_s3_uri(uri_to)

        if to_s3 and not from_s3:
            # Upload: local -> S3
            bucket, path = self._split_bucket_and_blob_from_path(uri_to)
            self._client.upload_file(uri_from, bucket, path)

        elif not to_s3 and from_s3:
            # Download: S3 -> local
            bucket, path = self._split_bucket_and_blob_from_path(uri_from)
            # TODO: Add progress bar with tqdm
            logger.info(f"Downloading {uri_from} to {uri_to}")
            self._client.download_file(bucket, path, uri_to)

        elif to_s3 and from_s3:
            # S3 to S3 copy
            with self.open(uri_from) as from_obj:
                with self.open(uri_to, "wb") as to_obj:
                    to_obj.write(from_obj.read())
        else:
            raise ValueError("At least one of uri_from or uri_to must be an S3 path")

    def ls(self, uri: str, file_type: str = "*", search: str = "**/", **kwargs) -> list[str]:
        """
        List files under S3 prefix.

        Args:
            uri: S3 prefix URI
            file_type: File extension filter (default: all)
            search: Search pattern (default: recursive)

        Returns:
            List of S3 URIs
        """
        results = self.glob(uri, file_type=file_type, search=search, **kwargs)
        return [obj.uri for obj in results if obj.uri is not None]

    def glob(
        self, uri: str, file_type: str = "*", search: str = "**/", **kwargs
    ) -> list[S3ObjectListing]:
        """
        List S3 objects with metadata.

        Args:
            uri: S3 prefix URI
            file_type: File extension filter
            search: Search pattern

        Returns:
            List of S3ObjectListing objects
        """
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        # Ensure trailing slash for directory prefixes
        if prefix and not prefix.endswith("/"):
            prefix = prefix + "/"

        try:
            response = self._client.list_objects_v2(Prefix=prefix, Bucket=bucket)
            contents = response.get("Contents")

            if not contents:
                return []

            return [S3ObjectListing(**d, bucket=bucket) for d in contents]

        except ClientError as ex:
            logger.error(f"Failed to list S3 objects at {uri}: {ex}")
            return []

    def ls_dirs(self, uri: str, max_keys: int = 100) -> list[str]:
        """
        List immediate child directories under S3 prefix.

        Args:
            uri: S3 prefix URI
            max_keys: Maximum directories to return

        Returns:
            List of directory URIs
        """
        bucket, key = self._split_bucket_and_blob_from_path(uri)
        key = f"{key.rstrip('/')}/"

        response = self._client.list_objects_v2(
            Bucket=bucket, Prefix=key, Delimiter="/", MaxKeys=max_keys
        )

        prefixes = response.get("CommonPrefixes", [])
        dirs = [p["Prefix"].rstrip("/").split("/")[-1] for p in prefixes]
        return [f"{uri}/{d}" for d in dirs]

    def ls_iter(self, uri: str, **options) -> Iterator[str]:
        """
        Iterate over S3 objects with pagination.

        TODO: Implement pagination with continuation tokens.

        Args:
            uri: S3 prefix URI
            **options: Listing options

        Yields:
            S3 URIs
        """
        # TODO: Implement pagination for large result sets
        logger.warning("ls_iter pagination not yet implemented - returning all results")
        yield from self.ls(uri, **options)

    def delete(self, uri: str, limit: int = 50) -> list[str]:
        """
        Delete S3 objects under prefix.

        Safety limit prevents accidental bulk deletions.

        Args:
            uri: S3 URI (file or prefix)
            limit: Maximum files to delete (safety limit)

        Returns:
            List of deleted URIs
        """
        deleted_files = self.ls(uri)

        if len(deleted_files) > limit:
            raise ValueError(
                f"Attempting to delete {len(deleted_files)} files exceeds "
                f"safety limit of {limit}. Increase limit parameter if intentional."
            )

        s3_resource = boto3.resource("s3")
        for file_uri in deleted_files:
            logger.debug(f"Deleting {file_uri}")
            bucket, key = self._split_bucket_and_blob_from_path(file_uri)
            s3_resource.Object(bucket, key).delete()

        # Delete the prefix marker if it exists
        bucket, key = self._split_bucket_and_blob_from_path(uri)
        try:
            s3_resource.Object(bucket, key).delete()
        except:
            pass  # Prefix marker may not exist

        return deleted_files

    def read_dataset(self, uri: str):
        """
        Read S3 data as PyArrow dataset.

        Useful for partitioned parquet datasets and lazy loading.

        Args:
            uri: S3 dataset URI

        Returns:
            PyArrow Dataset
        """
        if not pl:
            raise ImportError("Polars required for dataset operations. Install with: uv add polars")

        with self.open(uri, mode="rb") as f:
            return pl.read_parquet(f).to_arrow()

    def read_image(self, uri: str, version_id: str | None = None):
        """
        Read S3 object as PIL Image.

        Args:
            uri: S3 image URI
            version_id: Optional S3 version ID

        Returns:
            PIL Image
        """
        if not Image:
            raise ImportError("Pillow required for image support. Install with: uv add pillow")

        if version_id:
            bucket, key = self._split_bucket_and_blob_from_path(uri)
            response = self._client.get_object(Bucket=bucket, Key=key, VersionId=version_id)
            return Image.open(BytesIO(response["Body"].read()))

        with self.open(uri, "rb") as f:
            return Image.open(f)

    def cache_data(
        self,
        data: Any,
        cache_location: str | None = None,
        suffix: str | None = None,
        **kwargs,
    ) -> str:
        """
        Cache data to S3 (typically images).

        Args:
            data: Data to cache (Image, etc.)
            cache_location: S3 prefix for cache (default: from settings)
            suffix: File extension
            **kwargs: Additional options (uri, etc.)

        Returns:
            S3 URI of cached data
        """
        if "uri" in kwargs:
            return kwargs["uri"]

        cache_location = cache_location or f"s3://{settings.s3.bucket_name}/cache"

        if Image and isinstance(data, Image.Image):
            suffix = suffix or ".png"
            # TODO: Implement res_hash for unique file naming
            import uuid
            file_id = str(uuid.uuid4())
            uri = f"{cache_location}/images/{file_id}{suffix}"
            self.write(uri, data)
            return uri

        raise NotImplementedError(
            f"Caching not implemented for type {type(data)}. "
            "Currently supports: PIL Image. TODO: Add support for other types."
        )

    def apply(self, uri: str, fn: Callable[[str], Any]) -> Any:
        """
        Apply function to S3 file via temporary local copy.

        Downloads file to /tmp, applies function, then cleans up.

        Args:
            uri: S3 URI
            fn: Function that takes local file path

        Returns:
            Result of function call
        """
        with self.open(uri, "rb") as s3f:
            suffix = Path(uri).suffix
            with tempfile.NamedTemporaryFile(
                suffix=suffix, prefix="s3_", mode="wb", delete=False
            ) as f:
                f.write(s3f.read())
                f.flush()
                try:
                    return fn(f.name)
                finally:
                    # Clean up temp file
                    Path(f.name).unlink(missing_ok=True)

    def local_file(self, uri: str) -> str:
        """
        Download S3 file to /tmp and return local path.

        Args:
            uri: S3 URI

        Returns:
            Local file path
        """
        filename = Path(uri).name
        local_path = f"/tmp/{filename}"
        self.copy(uri, local_path)
        return local_path

    # ========================================================================
    # Parsing Hooks
    # ========================================================================
    # Convention: Separate uploads/ and parsed/ directories with deterministic matching
    # Uploads: s3://bucket/v1/uploads/user-123/2025/01/19/file.pdf
    # Parsed:  s3://bucket/v1/parsed/user-123/2025/01/19/file.pdf/content.md
    #          s3://bucket/v1/parsed/user-123/2025/01/19/file.pdf/metadata.json
    #          s3://bucket/v1/parsed/user-123/2025/01/19/file.pdf/images/page_1.png
    # ========================================================================

    def get_parsed_uri(self, uri: str, resource: str = "content.md") -> str:
        """
        Get URI for parsed version of a file.

        Maps uploads/ paths to parsed/ paths deterministically:
        uploads/user/2025/01/19/file.pdf -> parsed/user/2025/01/19/file.pdf/{resource}

        Args:
            uri: Original file URI (e.g., s3://bucket/v1/uploads/user/2025/01/19/file.pdf)
            resource: Resource within parsed directory (default: content.md)

        Returns:
            Parsed resource URI (e.g., s3://bucket/v1/parsed/user/2025/01/19/file.pdf/content.md)

        Example:
            # Original upload
            upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"

            # Get parsed markdown
            parsed_uri = fs.get_parsed_uri(upload_uri)
            # -> s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/content.md

            # Get parse metadata
            meta_uri = fs.get_parsed_uri(upload_uri, "metadata.json")
            # -> s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/metadata.json

            # Get extracted image
            img_uri = fs.get_parsed_uri(upload_uri, "images/page_1.png")
            # -> s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/images/page_1.png
        """
        # Parse the S3 URI
        bucket, key = self._split_bucket_and_blob_from_path(uri)

        # Replace uploads_prefix with parsed_prefix in the key
        # Handle both with and without version prefix
        uploads_prefix = settings.s3.uploads_prefix
        parsed_prefix = settings.s3.parsed_prefix

        if f"/{uploads_prefix}/" in key:
            # Replace uploads/ with parsed/ in the path
            new_key = key.replace(f"/{uploads_prefix}/", f"/{parsed_prefix}/", 1)
            # Append resource to the end (filename becomes a directory)
            parsed_key = f"{new_key}/{resource}"
        elif key.startswith(f"{uploads_prefix}/"):
            # Handle case without leading slash
            new_key = key.replace(f"{uploads_prefix}/", f"{parsed_prefix}/", 1)
            parsed_key = f"{new_key}/{resource}"
        else:
            # Fallback: append .parsed/ if not in uploads/ directory
            # This handles legacy paths or custom directories
            parsed_key = f"{key}.parsed/{resource}"

        return f"s3://{bucket}/{parsed_key}"

    def has_parsed(self, uri: str) -> bool:
        """
        Check if parsed version exists for a file.

        Args:
            uri: Original file URI

        Returns:
            True if metadata.json exists in .parsed/ directory

        Example:
            if fs.has_parsed("s3://bucket/file.pdf"):
                content = fs.read_parsed("s3://bucket/file.pdf")
            else:
                # Trigger parsing workflow
                content_service.process_and_save(uri)
        """
        metadata_uri = self.get_parsed_uri(uri, "metadata.json")
        return self.exists(metadata_uri)

    def read_parsed(self, uri: str, resource: str = "content.md", **options) -> Any:
        """
        Read parsed content for a file.

        Args:
            uri: Original file URI
            resource: Resource to read (default: content.md)
            **options: Format-specific read options

        Returns:
            Parsed content (format depends on resource)

        Raises:
            FileNotFoundError: If parsed version doesn't exist

        Example:
            # Read parsed markdown
            markdown = fs.read_parsed("s3://bucket/file.pdf")

            # Read parse metadata
            metadata = fs.read_parsed("s3://bucket/file.pdf", "metadata.json")

            # Read extracted table
            table = fs.read_parsed("s3://bucket/file.pdf", "tables/table_0.parquet")
        """
        parsed_uri = self.get_parsed_uri(uri, resource)

        if not self.exists(parsed_uri):
            raise FileNotFoundError(
                f"Parsed resource not found: {resource}. "
                f"Parse file first with ContentService.process_and_save('{uri}')"
            )

        return self.read(parsed_uri, **options)

    def write_parsed(
        self,
        uri: str,
        content: Any,
        resource: str = "content.md",
        metadata: dict[str, Any] | None = None,
    ):
        """
        Write parsed content for a file.

        Automatically writes metadata.json with parse info if provided.

        Args:
            uri: Original file URI
            content: Parsed content to write
            resource: Resource name (default: content.md)
            metadata: Optional parse metadata (provider, timestamp, etc.)

        Example:
            # Write parsed markdown
            fs.write_parsed(
                "s3://bucket/file.pdf",
                markdown_content,
                metadata={
                    "provider": "kreuzberg",
                    "timestamp": datetime.now().isoformat(),
                    "page_count": 10,
                }
            )

            # Write extracted image
            fs.write_parsed(
                "s3://bucket/file.pdf",
                image_data,
                resource="images/page_1.png"
            )

            # Write extracted table
            fs.write_parsed(
                "s3://bucket/file.pdf",
                table_df,
                resource="tables/table_0.parquet"
            )
        """
        # Write primary content
        parsed_uri = self.get_parsed_uri(uri, resource)
        self.write(parsed_uri, content)

        # Write metadata if provided
        if metadata is not None:
            # Add standard fields if not present
            if "timestamp" not in metadata:
                metadata["timestamp"] = datetime.now().isoformat()
            if "source_uri" not in metadata:
                metadata["source_uri"] = uri

            metadata_uri = self.get_parsed_uri(uri, "metadata.json")
            self.write(metadata_uri, metadata)

    def list_parsed_resources(self, uri: str) -> list[str]:
        """
        List all resources in parsed directory.

        Args:
            uri: Original file URI (upload path)

        Returns:
            List of resource paths (relative to parsed file directory)

        Example:
            upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"
            resources = fs.list_parsed_resources(upload_uri)
            # Returns: ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']

            # Read all resources
            for resource in resources:
                data = fs.read_parsed(upload_uri, resource)
        """
        # Get the parsed directory path (without specific resource)
        parsed_base = self.get_parsed_uri(uri, "")
        # Remove trailing slash for consistent listing
        parsed_base = parsed_base.rstrip("/")

        # List all files under the parsed directory
        all_uris = self.ls(parsed_base)

        # Extract relative paths from the parsed base
        resources = []
        for full_uri in all_uris:
            # Remove the parsed base prefix to get relative path
            if full_uri.startswith(parsed_base + "/"):
                relative = full_uri[len(parsed_base) + 1:]  # +1 for the /
                resources.append(relative)

        return resources