remdb-0.3.242-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of remdb might be problematic.

Files changed (235)
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/services/fs/s3_provider.py
@@ -0,0 +1,1042 @@
"""
S3 storage provider for the REM file system.

Features:
- Read/write multiple formats (JSON, YAML, CSV, Parquet, images, etc.)
- Presigned URLs for direct access
- Multipart uploads for large files
- Polars integration for columnar data
- Versioning support
- Directory operations (ls, ls_dirs, delete)

Integration:
- Uses rem.settings for S3 configuration
- ContentService for special format parsing (PDF, DOCX, etc.)
- IRSA (IAM Roles for Service Accounts) in EKS

Parsing Hooks:
- Convention: separate uploads/ and parsed/ directories
- Uploads: s3://bucket/v1/uploads/user/2025/01/19/file.pdf
- Parsed: s3://bucket/v1/parsed/user/2025/01/19/file.pdf/{resource}
- get_parsed_uri(): Get URI for parsed content/metadata/images/tables
- has_parsed(): Check if a file has been parsed
- read_parsed(): Read parsed markdown, metadata, or extracted resources
- write_parsed(): Write parsed content with automatic metadata tracking
- list_parsed_resources(): Discover all parsed resources

Example:
    fs = S3Provider()
    upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"

    # Check if already parsed
    if fs.has_parsed(upload_uri):
        markdown = fs.read_parsed(upload_uri)
    else:
        # Parse and cache
        result = parse_file(upload_uri)
        fs.write_parsed(
            upload_uri,
            result.markdown,
            metadata={"provider": "kreuzberg", "page_count": 10},
        )

    # List all parsed resources
    resources = fs.list_parsed_resources(upload_uri)
    # ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']
"""

from pathlib import Path
from typing import Any, BinaryIO, Callable, Iterator
from io import BytesIO
from urllib.parse import urlparse
from datetime import datetime
import json
import tempfile
import io

import boto3
from botocore.exceptions import ClientError
from pydantic import BaseModel, model_validator
from loguru import logger

from rem.settings import settings

# Optional imports for specific formats
try:
    import polars as pl
except ImportError:
    pl = None  # type: ignore[assignment]

try:
    import pandas as pd
except ImportError:
    pd = None  # type: ignore[assignment]

try:
    import yaml
except ImportError:
    yaml = None  # type: ignore[assignment]

try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment]

try:
    import pyarrow as pa
    import pyarrow.dataset as ds
except ImportError:
    pa = None
    ds = None


class S3ObjectListing(BaseModel):
    """S3 object metadata with convenience properties."""

    Key: str
    LastModified: datetime
    Size: int
    bucket: str
    uri: str | None = None

    def __repr__(self):
        return self.uri or f"s3://{self.bucket}/{self.Key}"

    @model_validator(mode="before")
    @classmethod
    def fixup(cls, data: Any) -> Any:
        """Construct the full URI from bucket and key."""
        data["uri"] = f"s3://{data['bucket']}/{data['Key']}"
        return data


class FileLikeWritable:
    """
    Wrapper around S3 put_object to provide a file-like write interface.

    Used for writing data that doesn't fit in memory or needs streaming.
    """

    def __init__(self, s3_client, bucket: str, key: str):
        self._client = s3_client
        self.bucket = bucket
        self.key = key

    def write(self, data: bytes, **options):
        """Write bytes to the S3 object."""
        if isinstance(data, BytesIO):
            data = data.getvalue()
        self._client.put_object(Bucket=self.bucket, Key=self.key, Body=data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        return None


def generate_presigned_url(url: str, expiry: int = 3600, for_upload: bool = False) -> str:
    """
    Generate presigned URL for S3 object access.

    Args:
        url: S3 URI (s3://bucket/key)
        expiry: URL expiration in seconds (default: 3600)
        for_upload: Generate PUT URL instead of GET (default: False)

    Returns:
        Presigned URL for direct S3 access

    Example:
        # Download URL
        url = generate_presigned_url("s3://bucket/file.pdf")

        # Upload URL
        url = generate_presigned_url("s3://bucket/file.pdf", for_upload=True)
    """
    s3 = S3Provider()

    if not s3.is_s3_uri(url):
        return url

    bucket_name, object_key = s3._split_bucket_and_blob_from_path(url)

    try:
        if for_upload:
            return s3._client.generate_presigned_url(
                "put_object",
                Params={"Bucket": bucket_name, "Key": object_key},
                ExpiresIn=expiry,
                HttpMethod="PUT",
            )

        return s3._client.generate_presigned_url(
            "get_object",
            Params={"Bucket": bucket_name, "Key": object_key},
            ExpiresIn=expiry,
        )

    except Exception as ex:
        logger.error(f"Failed to generate presigned URL for {url}: {ex}")
        raise

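# Usage sketch for presigned URLs (illustrative only; `requests` and the
# bucket/key below are assumptions, not part of this module):
#
#   import requests
#
#   # Let a browser or client upload directly, bypassing the API server
#   put_url = generate_presigned_url("s3://rem-io-staging/tmp/report.pdf", for_upload=True)
#   requests.put(put_url, data=open("report.pdf", "rb"))
#
#   # Time-limited download link (1 hour by default)
#   get_url = generate_presigned_url("s3://rem-io-staging/tmp/report.pdf")
#   pdf_bytes = requests.get(get_url).content
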
class S3Provider:
    """
    S3 storage provider with REM settings integration.

    Supports IRSA (IAM Roles for Service Accounts) in EKS for secure access.
    Falls back to access keys for local development or MinIO.

    Parsing Hooks:
    - get_parsed_uri(): Get URI for the parsed version of a file
    - read_parsed(): Read parsed content (markdown, images, etc.)
    - write_parsed(): Write parsed content with metadata
    - has_parsed(): Check if a parsed version exists

    Convention:
    - uploads/ paths map deterministically to parsed/ paths (see get_parsed_uri)
    - Fallback for paths outside uploads/: {original_uri}.parsed/
    - Metadata at .../metadata.json, content at .../content.md (or other formats)
    """

    def __init__(self):
        """Initialize S3 client from REM settings."""
        self._client = self._create_s3_client()

    def _create_s3_client(self):
        """Create S3 client with IRSA or configured credentials."""
        s3_config: dict[str, Any] = {
            "region_name": settings.s3.region,
        }

        # Custom endpoint for MinIO/LocalStack
        if settings.s3.endpoint_url:
            s3_config["endpoint_url"] = settings.s3.endpoint_url

        # Access keys (not needed with IRSA in EKS)
        if settings.s3.access_key_id and settings.s3.secret_access_key:
            s3_config["aws_access_key_id"] = settings.s3.access_key_id
            s3_config["aws_secret_access_key"] = settings.s3.secret_access_key

        # SSL configuration
        s3_config["use_ssl"] = settings.s3.use_ssl

        return boto3.client("s3", **s3_config)

    @staticmethod
    def is_s3_uri(uri: str) -> bool:
        """Check if URI is S3 format."""
        return uri.startswith("s3://")

    def _check_uri(self, uri: str):
        """Validate S3 URI format."""
        url = urlparse(uri)
        if url.scheme != "s3":
            raise ValueError(
                f"URI must be of the form s3://BUCKET/path/to/file "
                f"but got {uri} with scheme {url.scheme}"
            )

    def _split_bucket_and_blob_from_path(self, uri: str) -> tuple[str, str]:
        """
        Split S3 URI into bucket and key.

        Args:
            uri: S3 URI (s3://bucket/path/to/file)

        Returns:
            Tuple of (bucket, key)
        """
        self._check_uri(uri)
        url = urlparse(uri)
        return url.netloc, url.path.lstrip("/")

    def exists(self, uri: str) -> bool:
        """
        Check if S3 object or prefix exists.

        Args:
            uri: S3 URI

        Returns:
            True if exists, False otherwise
        """
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        # For files (name has an extension), use head_object
        if "." in Path(prefix).name:
            try:
                self._client.head_object(Bucket=bucket, Key=prefix)
                return True
            except ClientError:
                return False

        # For directories/prefixes, use list_objects_v2
        try:
            response = self._client.list_objects_v2(
                Prefix=prefix, Bucket=bucket, MaxKeys=1
            )
            return response.get("KeyCount", 0) > 0
        except ClientError:
            return False

    def open(self, uri: str, mode: str = "rb", version_id: str | None = None) -> BytesIO | FileLikeWritable:
        """
        Open S3 object as a file-like object.

        Args:
            uri: S3 URI
            mode: File mode (r, rb, w, wb)
            version_id: Optional S3 version ID for versioned buckets

        Returns:
            File-like object (BytesIO for read, FileLikeWritable for write)
        """
        if mode[0] == "r":
            return BytesIO(self.get_streaming_body(uri, version_id=version_id).read())

        bucket, key = self._split_bucket_and_blob_from_path(uri)
        return FileLikeWritable(self._client, bucket, key)

    def get_streaming_body(
        self,
        uri: str,
        version_id: str | None = None,
        **kwargs,
    ):
        """
        Get streaming body for S3 object.

        Args:
            uri: S3 URI
            version_id: Optional version ID

        Returns:
            S3 streaming body
        """
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        try:
            params = {"Bucket": bucket, "Key": prefix}
            if version_id:
                params["VersionId"] = version_id

            response = self._client.get_object(**params)
            return response["Body"]
        except ClientError as ex:
            logger.error(f"Failed to get S3 object {uri}: {ex}")
            raise

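    # Streaming sketch (illustrative; URI and chunk size are assumptions):
    # botocore's StreamingBody supports chunked iteration, so large objects
    # need not be buffered fully in memory the way open()/read() do:
    #
    #   fs = S3Provider()
    #   body = fs.get_streaming_body("s3://rem-io-staging/big/dump.bin")
    #   with open("/tmp/dump.bin", "wb") as out:
    #       for chunk in body.iter_chunks(chunk_size=8 * 1024 * 1024):
    #           out.write(chunk)
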
    def read(self, uri: str, use_polars: bool = True, version_id: str | None = None, **options) -> Any:
        """
        Read S3 object with format detection.

        Supports:
        - JSON (.json)
        - YAML (.yml, .yaml)
        - CSV (.csv)
        - Parquet (.parquet)
        - Feather (.feather)
        - Excel (.xlsx, .xls)
        - Text (.txt, .log, .md)
        - Images (.png, .jpg, .jpeg, .tiff, .svg)
        - PDF (.pdf) - TODO: integrate ContentService
        - DOCX (.docx) - TODO: integrate ContentService
        - WAV (.wav) - TODO: add audio provider

        Args:
            uri: S3 URI
            use_polars: Use Polars for dataframes (default: True)
            version_id: Optional S3 version ID
            **options: Format-specific options

        Returns:
            Parsed data in the appropriate format
        """
        p = Path(uri)
        suffix = p.suffix.lower()

        # TODO: Integrate ContentService for PDF/DOCX parsing
        if suffix == ".pdf":
            logger.warning("PDF parsing not yet implemented - use ContentService")
            raise NotImplementedError(
                "PDF parsing requires ContentService integration. "
                "TODO: from rem.services.content import ContentService; return ContentService().process_uri(uri)"
            )

        if suffix == ".docx":
            logger.warning("DOCX parsing not yet implemented")
            # TODO: Add python-docx provider
            raise NotImplementedError(
                "DOCX parsing not yet implemented. "
                "TODO: Add python-docx to dependencies and implement DocxProvider"
            )

        # Structured data formats
        if suffix in (".yml", ".yaml"):
            if not yaml:
                raise ImportError("PyYAML is required for YAML support")
            return yaml.safe_load(self.get_streaming_body(uri, version_id=version_id))

        if suffix == ".json":
            return json.load(self.get_streaming_body(uri, version_id=version_id))

        if suffix in (".txt", ".log", ".md"):
            return self.get_streaming_body(uri, version_id=version_id).read().decode()

        # Columnar data formats
        dataframe_lib = pl if use_polars and pl else pd
        if not dataframe_lib:
            raise ImportError(
                "Either Polars or Pandas is required for tabular data support. "
                "Install with: uv add polars"
            )

        if suffix == ".csv":
            with self.open(uri, "rb") as f:
                return dataframe_lib.read_csv(f, **options)

        if suffix == ".parquet":
            with self.open(uri, "rb") as f:
                return dataframe_lib.read_parquet(f, **options)

        if suffix == ".feather":
            with self.open(uri, "rb") as f:
                if use_polars and pl:
                    # Polars reads Feather (Arrow IPC) via read_ipc, not read_feather
                    return pl.read_ipc(f, **options)
                return pd.read_feather(f, **options)

        if suffix in (".xls", ".xlsx"):
            # Excel requires pandas (with openpyxl or xlrd as the engine)
            if not pd:
                raise ImportError("Pandas is required for Excel support")
            # TODO: Add openpyxl or xlrd to dependencies
            logger.warning("Excel support requires openpyxl or xlrd - add to pyproject.toml if needed")
            # Read through our client so custom endpoints (MinIO) work;
            # passing the s3:// URI to pandas would require s3fs and ignore settings
            with self.open(uri, "rb") as f:
                return pd.read_excel(f, sheet_name=None, **options)

        # Image formats
        if suffix in (".png", ".jpg", ".jpeg", ".tiff", ".tif"):
            if not Image:
                raise ImportError("Pillow is required for image support. Install with: uv add pillow")
            with self.open(uri, "rb") as s3f:
                image = Image.open(s3f)
                image.load()  # PIL is lazy; force the full read before the buffer closes
                return image

        if suffix == ".svg":
            return self.get_streaming_body(uri, version_id=version_id).read().decode()

        # TODO: Audio formats
        if suffix in (".wav", ".mp3", ".flac"):
            logger.warning(f"Audio format {suffix} not yet supported")
            # TODO: Add librosa or pydub provider
            raise NotImplementedError(
                f"Audio format {suffix} requires an audio processing library. "
                "TODO: Add librosa or pydub to dependencies"
            )

        # Binary formats
        if suffix == ".pickle":
            import pickle

            with self.open(uri, "rb") as f:
                return pickle.load(f)

        raise ValueError(
            f"Unsupported file format: {suffix}. "
            f"Supported formats: .json, .yaml, .csv, .parquet, .txt, .png, .jpg, etc."
        )

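    # Usage sketch for the read() dispatch (illustrative; bucket and keys are
    # assumptions):
    #
    #   fs = S3Provider()
    #   cfg = fs.read("s3://rem-io-staging/configs/app.yaml")         # dict
    #   rows = fs.read("s3://rem-io-staging/exports/users.parquet")   # polars.DataFrame
    #   rows_pd = fs.read("s3://rem-io-staging/exports/users.parquet",
    #                     use_polars=False)                           # pandas.DataFrame
    #   notes = fs.read("s3://rem-io-staging/notes/today.md")         # str
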
    def write(self, uri: str, data: Any, **options):
        """
        Write data to S3 with format detection.

        Args:
            uri: S3 URI
            data: Data to write (DataFrame, dict, Image, bytes, str)
            **options: Format-specific options
        """
        p = Path(uri)
        suffix = p.suffix.lower()
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        def write_object(writer_fn):
            """
            Helper to write via a BytesIO stream.

            Pattern: write_object(lambda s: data.write_parquet(s))
            - Creates an in-memory buffer
            - Calls the writer function to populate the buffer
            - Uploads the buffer contents to S3
            - Avoids writing temporary files to disk
            """
            stream = io.BytesIO()
            writer_fn(stream)
            self._client.put_object(Bucket=bucket, Key=prefix, Body=stream.getvalue())

        # Dataframe formats
        if suffix == ".parquet":
            if hasattr(data, "write_parquet"):  # Polars
                return write_object(lambda s: data.write_parquet(s, **options))
            elif hasattr(data, "to_parquet"):  # Pandas
                return write_object(lambda s: data.to_parquet(s, **options))
            raise TypeError(f"Cannot write {type(data)} to parquet")

        if suffix == ".csv":
            if hasattr(data, "write_csv"):  # Polars
                return write_object(lambda s: data.write_csv(s, **options))
            elif hasattr(data, "to_csv"):  # Pandas
                return write_object(lambda s: data.to_csv(s, index=False, **options))
            elif isinstance(data, (bytes, str)):
                content = data.encode("utf-8") if isinstance(data, str) else data
                return self._client.put_object(
                    Bucket=bucket, Key=prefix, Body=content, ContentType="text/csv"
                )
            raise TypeError(f"Cannot write {type(data)} to CSV")

        if suffix == ".feather":
            if hasattr(data, "write_ipc"):  # Polars (Feather v2 is Arrow IPC)
                return write_object(lambda s: data.write_ipc(s, **options))
            elif hasattr(data, "to_feather"):  # Pandas
                return write_object(lambda s: data.to_feather(s, **options))
            raise TypeError(f"Cannot write {type(data)} to feather")

        # Structured data formats
        if suffix in (".yml", ".yaml"):
            if isinstance(data, dict):
                if not yaml:
                    raise ImportError("PyYAML required for YAML support")
                yaml_str = yaml.safe_dump(data)
                return self._client.put_object(Bucket=bucket, Key=prefix, Body=yaml_str.encode("utf-8"))
            raise TypeError(f"YAML requires dict, got {type(data)}")

        if suffix == ".json":
            if isinstance(data, dict):
                json_str = json.dumps(data)
                return self._client.put_object(Bucket=bucket, Key=prefix, Body=json_str.encode("utf-8"))
            raise TypeError(f"JSON requires dict, got {type(data)}")

        # Image formats
        if suffix in (".png", ".jpg", ".jpeg", ".tiff", ".tif"):
            if not Image:
                raise ImportError("Pillow required for image support")
            if not isinstance(data, Image.Image):
                data = Image.fromarray(data)
            # Map extension to a PIL format name (PIL knows JPEG/TIFF, not JPG/TIF)
            format_name = {"jpg": "JPEG", "tif": "TIFF"}.get(suffix[1:], suffix[1:].upper())
            _data = BytesIO()
            save_options = {"format": format_name, **options}
            if "dpi" in options:
                dpi = options["dpi"]
                save_options["dpi"] = (dpi, dpi) if isinstance(dpi, int) else dpi
            data.save(_data, **save_options)
            return self._client.put_object(Bucket=bucket, Key=prefix, Body=_data.getvalue())

        # Document formats
        if suffix == ".pdf":
            return self._client.put_object(
                Bucket=bucket, Key=prefix, Body=data, ContentType="application/pdf"
            )

        if suffix == ".html":
            return self._client.put_object(
                Bucket=bucket, Key=prefix, Body=data, ContentType="text/html"
            )

        # Binary/text fallback
        if suffix == ".pickle":
            import pickle

            return write_object(lambda s: pickle.dump(data, s, **options))

        # Default: write as bytes/string
        return self._client.put_object(Bucket=bucket, Key=prefix, Body=data)

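    # Usage sketch for the write() dispatch (illustrative; mirrors read() above):
    #
    #   fs.write("s3://rem-io-staging/exports/users.parquet", df)     # DataFrame
    #   fs.write("s3://rem-io-staging/configs/app.yaml", {"k": "v"})  # dict -> YAML
    #   fs.write("s3://rem-io-staging/assets/page.png", image,
    #            dpi=300)                                             # PIL Image
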
    def copy(self, uri_from: str, uri_to: str):
        """
        Copy files local-to-S3, S3-to-local, or S3-to-S3.

        Args:
            uri_from: Source URI (s3://... or local path)
            uri_to: Destination URI (s3://... or local path)
        """
        from_s3 = self.is_s3_uri(uri_from)
        to_s3 = self.is_s3_uri(uri_to)

        if to_s3 and not from_s3:
            # Upload: local -> S3
            bucket, path = self._split_bucket_and_blob_from_path(uri_to)
            self._client.upload_file(uri_from, bucket, path)

        elif not to_s3 and from_s3:
            # Download: S3 -> local
            bucket, path = self._split_bucket_and_blob_from_path(uri_from)
            # TODO: Add progress bar with tqdm
            logger.info(f"Downloading {uri_from} to {uri_to}")
            self._client.download_file(bucket, path, uri_to)

        elif to_s3 and from_s3:
            # S3 -> S3 copy, round-tripping bytes through this process
            # (a server-side alternative is sketched below)
            with self.open(uri_from) as from_obj:
                with self.open(uri_to, "wb") as to_obj:
                    to_obj.write(from_obj.read())
        else:
            raise ValueError("At least one of uri_from or uri_to must be an S3 path")

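    # Note (sketch): for large objects, a server-side copy avoids pulling the
    # bytes through this process; boto3 supports it directly:
    #
    #   self._client.copy_object(
    #       Bucket=dst_bucket,
    #       Key=dst_key,
    #       CopySource={"Bucket": src_bucket, "Key": src_key},
    #   )
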
    def ls(self, uri: str, file_type: str = "*", search: str = "**/", **kwargs) -> list[str]:
        """
        List files under S3 prefix.

        Args:
            uri: S3 prefix URI
            file_type: File extension filter (default: all)
            search: Search pattern (default: recursive)

        Returns:
            List of S3 URIs
        """
        results = self.glob(uri, file_type=file_type, search=search, **kwargs)
        return [obj.uri for obj in results if obj.uri is not None]

    def glob(
        self, uri: str, file_type: str = "*", search: str = "**/", **kwargs
    ) -> list[S3ObjectListing]:
        """
        List S3 objects with metadata.

        Args:
            uri: S3 prefix URI
            file_type: File extension filter
            search: Search pattern

        Returns:
            List of S3ObjectListing objects
        """
        bucket, prefix = self._split_bucket_and_blob_from_path(uri)

        # Ensure trailing slash for directory prefixes
        if prefix and not prefix.endswith("/"):
            prefix = prefix + "/"

        try:
            response = self._client.list_objects_v2(Prefix=prefix, Bucket=bucket)
            contents = response.get("Contents")

            if not contents:
                return []

            return [S3ObjectListing(**d, bucket=bucket) for d in contents]

        except ClientError as ex:
            logger.error(f"Failed to list S3 objects at {uri}: {ex}")
            return []

    def ls_dirs(self, uri: str, max_keys: int = 100) -> list[str]:
        """
        List immediate child directories under S3 prefix.

        Args:
            uri: S3 prefix URI
            max_keys: Maximum directories to return

        Returns:
            List of directory URIs
        """
        bucket, key = self._split_bucket_and_blob_from_path(uri)
        key = f"{key.rstrip('/')}/"

        response = self._client.list_objects_v2(
            Bucket=bucket, Prefix=key, Delimiter="/", MaxKeys=max_keys
        )

        prefixes = response.get("CommonPrefixes", [])
        dirs = [p["Prefix"].rstrip("/").split("/")[-1] for p in prefixes]
        base = uri.rstrip("/")  # Avoid double slashes when uri has a trailing /
        return [f"{base}/{d}" for d in dirs]

    def ls_iter(self, uri: str, **options) -> Iterator[str]:
        """
        Iterate over S3 objects with pagination.

        TODO: Implement pagination with continuation tokens
        (one approach is sketched below).

        Args:
            uri: S3 prefix URI
            **options: Listing options

        Yields:
            S3 URIs
        """
        # TODO: Implement pagination for large result sets
        logger.warning("ls_iter pagination not yet implemented - returning all results")
        yield from self.ls(uri, **options)

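    # One way to implement the pagination TODO (sketch; `bucket`/`prefix` would
    # come from _split_bucket_and_blob_from_path): boto3 paginators handle
    # continuation tokens internally:
    #
    #   paginator = self._client.get_paginator("list_objects_v2")
    #   for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    #       for obj in page.get("Contents", []):
    #           yield f"s3://{bucket}/{obj['Key']}"
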
    def delete(self, uri: str, limit: int = 50) -> list[str]:
        """
        Delete S3 objects under prefix.

        Safety limit prevents accidental bulk deletions.

        Args:
            uri: S3 URI (file or prefix)
            limit: Maximum files to delete (safety limit)

        Returns:
            List of deleted URIs
        """
        files_to_delete = self.ls(uri)

        if len(files_to_delete) > limit:
            raise ValueError(
                f"Attempting to delete {len(files_to_delete)} files exceeds "
                f"safety limit of {limit}. Increase limit parameter if intentional."
            )

        # Use the configured client so custom endpoints (MinIO) and credentials
        # from settings are honored (a batched alternative is sketched below)
        for file_uri in files_to_delete:
            logger.debug(f"Deleting {file_uri}")
            bucket, key = self._split_bucket_and_blob_from_path(file_uri)
            self._client.delete_object(Bucket=bucket, Key=key)

        # Delete the prefix marker if it exists
        bucket, key = self._split_bucket_and_blob_from_path(uri)
        try:
            self._client.delete_object(Bucket=bucket, Key=key)
        except ClientError:
            pass  # Prefix marker may not exist

        return files_to_delete

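    # Sketch: for larger prefixes, delete_objects removes up to 1000 keys per
    # request instead of one request per object:
    #
    #   self._client.delete_objects(
    #       Bucket=bucket,
    #       Delete={"Objects": [{"Key": k} for k in keys_batch]},
    #   )
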
    def read_dataset(self, uri: str):
        """
        Read S3 parquet data into PyArrow.

        Note: this currently materializes the whole file via Polars and returns
        a PyArrow Table rather than a lazy dataset. Useful as a stopgap for
        partitioned parquet datasets.

        Args:
            uri: S3 dataset URI

        Returns:
            PyArrow Table
        """
        if not pl:
            raise ImportError("Polars required for dataset operations. Install with: uv add polars")

        with self.open(uri, mode="rb") as f:
            return pl.read_parquet(f).to_arrow()

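    # Sketch of true lazy loading (this is what the currently unused pa/ds
    # imports above would enable; bucket path and region are assumptions):
    # pyarrow.dataset can scan partitioned parquet without materializing it,
    # given a pyarrow S3 filesystem:
    #
    #   from pyarrow import fs as pafs
    #   s3fs = pafs.S3FileSystem(region=settings.s3.region)
    #   dataset = ds.dataset("rem-io-staging/exports/events/",
    #                        format="parquet", filesystem=s3fs)
    #   table = dataset.head(100)
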
    def read_image(self, uri: str, version_id: str | None = None):
        """
        Read S3 object as PIL Image.

        Args:
            uri: S3 image URI
            version_id: Optional S3 version ID

        Returns:
            PIL Image
        """
        if not Image:
            raise ImportError("Pillow required for image support. Install with: uv add pillow")

        if version_id:
            bucket, key = self._split_bucket_and_blob_from_path(uri)
            response = self._client.get_object(Bucket=bucket, Key=key, VersionId=version_id)
            return Image.open(BytesIO(response["Body"].read()))

        with self.open(uri, "rb") as f:
            image = Image.open(f)
            image.load()  # PIL is lazy; force the full read before the buffer closes
            return image

    def cache_data(
        self,
        data: Any,
        cache_location: str | None = None,
        suffix: str | None = None,
        **kwargs,
    ) -> str:
        """
        Cache data to S3 (typically images).

        Args:
            data: Data to cache (Image, etc.)
            cache_location: S3 prefix for cache (default: from settings)
            suffix: File extension
            **kwargs: Additional options (uri, etc.)

        Returns:
            S3 URI of cached data
        """
        if "uri" in kwargs:
            # Caller supplied an existing URI; treat as already cached
            return kwargs["uri"]

        cache_location = cache_location or f"s3://{settings.s3.bucket_name}/cache"

        if Image and isinstance(data, Image.Image):
            suffix = suffix or ".png"
            # TODO: Implement res_hash for unique file naming
            import uuid

            file_id = str(uuid.uuid4())
            uri = f"{cache_location}/images/{file_id}{suffix}"
            self.write(uri, data)
            return uri

        raise NotImplementedError(
            f"Caching not implemented for type {type(data)}. "
            "Currently supports: PIL Image. TODO: Add support for other types."
        )

    def apply(self, uri: str, fn: Callable[[str], Any]) -> Any:
        """
        Apply a function to an S3 file via a temporary local copy.

        Downloads the file to a temp path, applies the function, then cleans up.

        Args:
            uri: S3 URI
            fn: Function that takes a local file path

        Returns:
            Result of the function call
        """
        with self.open(uri, "rb") as s3f:
            suffix = Path(uri).suffix
            with tempfile.NamedTemporaryFile(
                suffix=suffix, prefix="s3_", mode="wb", delete=False
            ) as f:
                f.write(s3f.read())
                f.flush()
                try:
                    return fn(f.name)
                finally:
                    # Clean up temp file
                    Path(f.name).unlink(missing_ok=True)

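    # Usage sketch for apply() (illustrative; URI is an assumption): hand the
    # temp path to any library that only accepts local files:
    #
    #   size = fs.apply("s3://rem-io-staging/big/dump.bin",
    #                   lambda path: Path(path).stat().st_size)
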
    def local_file(self, uri: str) -> str:
        """
        Download S3 file to /tmp and return the local path.

        Args:
            uri: S3 URI

        Returns:
            Local file path
        """
        filename = Path(uri).name
        local_path = f"/tmp/{filename}"
        self.copy(uri, local_path)
        return local_path

    # ========================================================================
    # Parsing Hooks
    # ========================================================================
    # Convention: Separate uploads/ and parsed/ directories with deterministic matching
    # Uploads: s3://bucket/v1/uploads/user-123/2025/01/19/file.pdf
    # Parsed:  s3://bucket/v1/parsed/user-123/2025/01/19/file.pdf/content.md
    #          s3://bucket/v1/parsed/user-123/2025/01/19/file.pdf/metadata.json
    #          s3://bucket/v1/parsed/user-123/2025/01/19/file.pdf/images/page_1.png
    # ========================================================================

    def get_parsed_uri(self, uri: str, resource: str = "content.md") -> str:
        """
        Get URI for the parsed version of a file.

        Maps uploads/ paths to parsed/ paths deterministically:
        uploads/user/2025/01/19/file.pdf -> parsed/user/2025/01/19/file.pdf/{resource}

        Args:
            uri: Original file URI (e.g., s3://bucket/v1/uploads/user/2025/01/19/file.pdf)
            resource: Resource within parsed directory (default: content.md)

        Returns:
            Parsed resource URI (e.g., s3://bucket/v1/parsed/user/2025/01/19/file.pdf/content.md)

        Example:
            # Original upload
            upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"

            # Get parsed markdown
            parsed_uri = fs.get_parsed_uri(upload_uri)
            # -> s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/content.md

            # Get parse metadata
            meta_uri = fs.get_parsed_uri(upload_uri, "metadata.json")
            # -> s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/metadata.json

            # Get extracted image
            img_uri = fs.get_parsed_uri(upload_uri, "images/page_1.png")
            # -> s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/images/page_1.png
        """
        # Parse the S3 URI
        bucket, key = self._split_bucket_and_blob_from_path(uri)

        # Replace uploads_prefix with parsed_prefix in the key
        # Handle both with and without version prefix
        uploads_prefix = settings.s3.uploads_prefix
        parsed_prefix = settings.s3.parsed_prefix

        if f"/{uploads_prefix}/" in key:
            # Replace uploads/ with parsed/ in the path
            new_key = key.replace(f"/{uploads_prefix}/", f"/{parsed_prefix}/", 1)
            # Append resource to the end (filename becomes a directory)
            parsed_key = f"{new_key}/{resource}"
        elif key.startswith(f"{uploads_prefix}/"):
            # Handle case without leading slash
            new_key = key.replace(f"{uploads_prefix}/", f"{parsed_prefix}/", 1)
            parsed_key = f"{new_key}/{resource}"
        else:
            # Fallback: append .parsed/ if not in uploads/ directory
            # This handles legacy paths or custom directories
            parsed_key = f"{key}.parsed/{resource}"

        return f"s3://{bucket}/{parsed_key}"

    def has_parsed(self, uri: str) -> bool:
        """
        Check if a parsed version exists for a file.

        Args:
            uri: Original file URI

        Returns:
            True if metadata.json exists in the parsed directory

        Example:
            if fs.has_parsed("s3://bucket/file.pdf"):
                content = fs.read_parsed("s3://bucket/file.pdf")
            else:
                # Trigger parsing workflow
                content_service.process_and_save(uri)
        """
        metadata_uri = self.get_parsed_uri(uri, "metadata.json")
        return self.exists(metadata_uri)

    def read_parsed(self, uri: str, resource: str = "content.md", **options) -> Any:
        """
        Read parsed content for a file.

        Args:
            uri: Original file URI
            resource: Resource to read (default: content.md)
            **options: Format-specific read options

        Returns:
            Parsed content (format depends on resource)

        Raises:
            FileNotFoundError: If parsed version doesn't exist

        Example:
            # Read parsed markdown
            markdown = fs.read_parsed("s3://bucket/file.pdf")

            # Read parse metadata
            metadata = fs.read_parsed("s3://bucket/file.pdf", "metadata.json")

            # Read extracted table
            table = fs.read_parsed("s3://bucket/file.pdf", "tables/table_0.parquet")
        """
        parsed_uri = self.get_parsed_uri(uri, resource)

        if not self.exists(parsed_uri):
            raise FileNotFoundError(
                f"Parsed resource not found: {resource}. "
                f"Parse file first with ContentService.process_and_save('{uri}')"
            )

        return self.read(parsed_uri, **options)

    def write_parsed(
        self,
        uri: str,
        content: Any,
        resource: str = "content.md",
        metadata: dict[str, Any] | None = None,
    ):
        """
        Write parsed content for a file.

        Automatically writes metadata.json with parse info if provided.

        Args:
            uri: Original file URI
            content: Parsed content to write
            resource: Resource name (default: content.md)
            metadata: Optional parse metadata (provider, timestamp, etc.)

        Example:
            # Write parsed markdown
            fs.write_parsed(
                "s3://bucket/file.pdf",
                markdown_content,
                metadata={
                    "provider": "kreuzberg",
                    "timestamp": datetime.now().isoformat(),
                    "page_count": 10,
                },
            )

            # Write extracted image
            fs.write_parsed(
                "s3://bucket/file.pdf",
                image_data,
                resource="images/page_1.png",
            )

            # Write extracted table
            fs.write_parsed(
                "s3://bucket/file.pdf",
                table_df,
                resource="tables/table_0.parquet",
            )
        """
        # Write primary content
        parsed_uri = self.get_parsed_uri(uri, resource)
        self.write(parsed_uri, content)

        # Write metadata if provided
        if metadata is not None:
            # Add standard fields if not present
            if "timestamp" not in metadata:
                metadata["timestamp"] = datetime.now().isoformat()
            if "source_uri" not in metadata:
                metadata["source_uri"] = uri

            metadata_uri = self.get_parsed_uri(uri, "metadata.json")
            self.write(metadata_uri, metadata)

    def list_parsed_resources(self, uri: str) -> list[str]:
        """
        List all resources in the parsed directory.

        Args:
            uri: Original file URI (upload path)

        Returns:
            List of resource paths (relative to the parsed file directory)

        Example:
            upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"
            resources = fs.list_parsed_resources(upload_uri)
            # Returns: ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']

            # Read all resources
            for resource in resources:
                data = fs.read_parsed(upload_uri, resource)
        """
        # Get the parsed directory path (without a specific resource)
        parsed_base = self.get_parsed_uri(uri, "")
        # Remove trailing slash for consistent listing
        parsed_base = parsed_base.rstrip("/")

        # List all files under the parsed directory
        all_uris = self.ls(parsed_base)

        # Extract relative paths from the parsed base
        resources = []
        for full_uri in all_uris:
            # Remove the parsed base prefix to get the relative path
            if full_uri.startswith(parsed_base + "/"):
                relative = full_uri[len(parsed_base) + 1:]  # +1 for the /
                resources.append(relative)

        return resources
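

# Smoke-test sketch for the parsing-hook round trip (illustrative; assumes a
# reachable S3 endpoint such as MinIO/LocalStack and that the bucket below
# exists — adjust names to your settings):
if __name__ == "__main__":
    fs = S3Provider()
    upload = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"

    # Write parsed markdown plus metadata (timestamp/source_uri auto-added)
    fs.write_parsed(upload, b"# Report\n\nparsed body", metadata={"provider": "kreuzberg"})
    assert fs.has_parsed(upload)

    print(fs.read_parsed(upload))                   # parsed markdown (str)
    print(fs.read_parsed(upload, "metadata.json"))  # dict with provider/timestamp/source_uri
    print(fs.list_parsed_resources(upload))         # ['content.md', 'metadata.json']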