remdb 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (187) hide show
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +565 -0
  44. rem/cli/commands/configure.py +423 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1124 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +88 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +657 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +229 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.2.6.dist-info/METADATA +1191 -0
  185. remdb-0.2.6.dist-info/RECORD +187 -0
  186. remdb-0.2.6.dist-info/WHEEL +4 -0
  187. remdb-0.2.6.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,760 @@
1
+ """
2
+ Local filesystem provider for REM.
3
+
4
+ Provides consistent interface with S3Provider for local file operations.
5
+ Supports same formats and operations as S3Provider.
6
+
7
+ Parsing Hooks:
8
+ - Convention: Separate uploads/ and parsed/ directories
9
+ - Uploads: ~/.rem/fs/v1/uploads/user/2025/01/19/file.pdf
10
+ - Parsed: ~/.rem/fs/v1/parsed/user/2025/01/19/file.pdf/{resource}
11
+ - get_parsed_uri(): Get path for parsed content/metadata/images/tables
12
+ - has_parsed(): Check if file has been parsed
13
+ - read_parsed(): Read parsed markdown, metadata, or extracted resources
14
+ - write_parsed(): Write parsed content with automatic metadata tracking
15
+ - list_parsed_resources(): Discover all parsed resources
16
+
17
+ Example:
18
+ fs = LocalProvider()
19
+ upload_path = "/home/user/.rem/fs/v1/uploads/user-123/2025/01/19/report.pdf"
20
+
21
+ # Check if already parsed
22
+ if fs.has_parsed(upload_path):
23
+ markdown = fs.read_parsed(upload_path)
24
+ else:
25
+ # Parse and cache locally
26
+ result = parse_file(upload_path)
27
+ fs.write_parsed(
28
+ upload_path,
29
+ result.markdown,
30
+ metadata={"provider": "kreuzberg", "page_count": 10}
31
+ )
32
+
33
+ # List all parsed resources
34
+ resources = fs.list_parsed_resources(upload_path)
35
+ # ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']
36
+ """
37
+
38
+ from pathlib import Path
39
+ from typing import Any, BinaryIO, Callable, Iterator
40
+ from datetime import datetime
41
+ import json
42
+ import shutil
43
+ import glob as glob_module
44
+
45
+ from loguru import logger
46
+
47
+ # Optional imports for specific formats
48
+ try:
49
+ import polars as pl
50
+ except ImportError:
51
+ pl = None # type: ignore[assignment]
52
+
53
+ try:
54
+ import pandas as pd
55
+ except ImportError:
56
+ pd = None # type: ignore[assignment]
57
+
58
+ try:
59
+ import yaml
60
+ except ImportError:
61
+ yaml = None # type: ignore[assignment]
62
+
63
+ try:
64
+ from PIL import Image
65
+ except ImportError:
66
+ Image = None # type: ignore[assignment]
67
+
68
+
69
+ class LocalProvider:
70
+ """
71
+ Local filesystem provider with format detection.
72
+
73
+ Mirrors S3Provider interface for seamless filesystem abstraction.
74
+
75
+ Parsing Hooks:
76
+ - get_parsed_uri(): Get path for parsed version of a file
77
+ - read_parsed(): Read parsed content (markdown, images, etc.)
78
+ - write_parsed(): Write parsed content with metadata
79
+ - has_parsed(): Check if parsed version exists
80
+
81
+ Convention:
82
+ - Parsed files stored at {original_path}.parsed/
83
+ - Metadata at {original_path}.parsed/metadata.json
84
+ - Content at {original_path}.parsed/content.md (or other formats)
85
+ """
86
+
87
+ def exists(self, uri: str) -> bool:
88
+ """
89
+ Check if local file or directory exists.
90
+
91
+ Args:
92
+ uri: Local file path
93
+
94
+ Returns:
95
+ True if exists, False otherwise
96
+ """
97
+ return Path(uri).exists()
98
+
99
+ def open(self, uri: str, mode: str = "rb") -> BinaryIO:
100
+ """
101
+ Open local file.
102
+
103
+ Args:
104
+ uri: Local file path
105
+ mode: File mode (r, rb, w, wb, etc.)
106
+
107
+ Returns:
108
+ File object
109
+ """
110
+ # Ensure parent directory exists for write operations
111
+ if mode[0] == "w" or mode[0] == "a":
112
+ Path(uri).parent.mkdir(parents=True, exist_ok=True)
113
+
114
+ return open(uri, mode) # type: ignore[return-value]
115
+
116
    def read(self, uri: str, use_polars: bool = True, **options) -> Any:
        """
        Read a local file with extension-based format detection.

        Supported formats (dispatch is by lowercased suffix, in the order
        checked below — order matters, earlier branches win):
        - JSON (.json)
        - YAML (.yml, .yaml) — requires PyYAML
        - CSV (.csv), Parquet (.parquet), Feather (.feather) — Polars or Pandas
        - Excel (.xlsx, .xls) — Pandas (plus openpyxl/xlrd)
        - Text (.txt, .log, .md)
        - Images (.png, .jpg, .jpeg, .tiff, .tif) — Pillow; .svg returned as text
        - Pickle (.pickle)
        - PDF (.pdf) / DOCX (.docx) — not implemented, raise NotImplementedError

        Args:
            uri: Local file path
            use_polars: Prefer Polars over Pandas for dataframes (default: True)
            **options: Format-specific options forwarded to the reader

        Returns:
            Parsed data (type depends on the format)

        Raises:
            FileNotFoundError: If the file does not exist.
            NotImplementedError: For PDF/DOCX/audio formats.
            ImportError: If the optional dependency for a format is missing.
            ValueError: For unrecognized extensions.
        """
        p = Path(uri)
        suffix = p.suffix.lower()

        if not p.exists():
            raise FileNotFoundError(f"File not found: {uri}")

        # TODO: Integrate ContentService for PDF/DOCX
        if suffix == ".pdf":
            logger.warning("PDF parsing not yet implemented - use ContentService")
            raise NotImplementedError(
                "PDF parsing requires ContentService integration. "
                "TODO: from rem.services.content import ContentService"
            )

        if suffix == ".docx":
            logger.warning("DOCX parsing not yet implemented")
            # TODO: Add python-docx
            raise NotImplementedError(
                "DOCX requires python-docx. "
                "TODO: uv add python-docx and implement DocxProvider"
            )

        # Structured data (yaml/pl/pd/Image are module-level optional imports,
        # set to None when the dependency is absent)
        if suffix in [".yml", ".yaml"]:
            if not yaml:
                raise ImportError("PyYAML required for YAML support")
            with open(uri, "r") as f:
                return yaml.safe_load(f)

        if suffix == ".json":
            with open(uri, "r") as f:
                return json.load(f)

        if suffix in [".txt", ".log", ".md"]:
            with open(uri, "r") as f:
                return f.read()

        # Columnar data: pick Polars when requested and installed, else Pandas
        dataframe_lib = pl if use_polars and pl else pd
        if not dataframe_lib:
            raise ImportError(
                "Either Polars or Pandas required for tabular data. "
                "Install with: uv add polars"
            )

        if suffix == ".csv":
            return dataframe_lib.read_csv(uri, **options)

        if suffix == ".parquet":
            return dataframe_lib.read_parquet(uri, **options)

        if suffix == ".feather":
            # TODO: Verify Polars feather support
            # NOTE(review): Polars exposes read_ipc, not read_feather — this
            # branch likely raises AttributeError under Polars; confirm.
            if use_polars and pl:
                logger.warning("Feather in Polars - consider Pandas if issues")
            return dataframe_lib.read_feather(uri, **options)

        if suffix in [".xls", ".xlsx"]:
            if not pd:
                raise ImportError("Pandas required for Excel")
            # TODO: Requires openpyxl or xlrd
            logger.warning("Excel requires openpyxl/xlrd - add to pyproject.toml if needed")
            # sheet_name=None returns a dict of all sheets keyed by sheet name
            return pd.read_excel(uri, sheet_name=None, **options)

        # Images
        if suffix in [".png", ".jpg", ".jpeg", ".tiff", ".tif"]:
            if not Image:
                raise ImportError("Pillow required for images. Install with: uv add pillow")
            return Image.open(uri)

        if suffix == ".svg":
            # TODO: SVG to PIL conversion
            with open(uri, "r") as f:
                return f.read()  # Return SVG as text for now

        # TODO: Audio formats
        if suffix in [".wav", ".mp3", ".flac"]:
            logger.warning(f"Audio format {suffix} not supported")
            raise NotImplementedError(
                f"Audio format {suffix} requires audio library. "
                "TODO: Add librosa or pydub"
            )

        # Binary
        if suffix == ".pickle":
            # NOTE(review): pickle.load executes arbitrary code — only safe on
            # trusted, locally-produced files; confirm callers never feed it
            # untrusted uploads.
            import pickle
            with open(uri, "rb") as f:
                return pickle.load(f)

        raise ValueError(
            f"Unsupported file format: {suffix}. "
            "Supported: .json, .yaml, .csv, .parquet, .txt, .png, etc."
        )
233
+
234
+ def write(self, uri: str, data: Any, **options):
235
+ """
236
+ Write data to local file with format detection.
237
+
238
+ Mirrors S3Provider.write() interface for seamless filesystem abstraction.
239
+ Key difference: writes directly to disk instead of BytesIO buffer.
240
+
241
+ Args:
242
+ uri: Local file path
243
+ data: Data to write (DataFrame, dict, Image, bytes, str)
244
+ **options: Format-specific options
245
+ """
246
+ p = Path(uri)
247
+ suffix = p.suffix.lower()
248
+
249
+ # Ensure parent directory exists (unlike S3, local FS needs explicit mkdir)
250
+ p.parent.mkdir(parents=True, exist_ok=True)
251
+
252
+ # Dataframes
253
+ if suffix == ".parquet":
254
+ if hasattr(data, "write_parquet"): # Polars
255
+ data.write_parquet(uri, **options)
256
+ elif hasattr(data, "to_parquet"): # Pandas
257
+ data.to_parquet(uri, **options)
258
+ else:
259
+ raise TypeError(f"Cannot write {type(data)} to parquet")
260
+ return
261
+
262
+ if suffix == ".csv":
263
+ if hasattr(data, "write_csv"): # Polars
264
+ data.write_csv(uri, **options)
265
+ elif hasattr(data, "to_csv"): # Pandas
266
+ data.to_csv(uri, index=False, **options)
267
+ elif isinstance(data, (str, bytes)):
268
+ mode = "wb" if isinstance(data, bytes) else "w"
269
+ with open(uri, mode) as f:
270
+ f.write(data)
271
+ else:
272
+ raise TypeError(f"Cannot write {type(data)} to CSV")
273
+ return
274
+
275
+ if suffix == ".feather":
276
+ if hasattr(data, "write_feather"): # Polars (verify method)
277
+ data.write_feather(uri, **options)
278
+ elif hasattr(data, "to_feather"): # Pandas
279
+ data.to_feather(uri, **options)
280
+ else:
281
+ raise TypeError(f"Cannot write {type(data)} to feather")
282
+ return
283
+
284
+ # Structured data
285
+ if suffix in [".yml", ".yaml"]:
286
+ if not isinstance(data, dict):
287
+ raise TypeError(f"YAML requires dict, got {type(data)}")
288
+ if not yaml:
289
+ raise ImportError("PyYAML required for YAML")
290
+ with open(uri, "w") as f:
291
+ yaml.safe_dump(data, f)
292
+ return
293
+
294
+ if suffix == ".json":
295
+ if not isinstance(data, dict):
296
+ raise TypeError(f"JSON requires dict, got {type(data)}")
297
+ with open(uri, "w") as f:
298
+ json.dump(data, f, indent=2)
299
+ return
300
+
301
+ # Images
302
+ if suffix in [".png", ".jpg", ".jpeg", ".tiff", ".tif"]:
303
+ if not Image:
304
+ raise ImportError("Pillow required for images")
305
+ if not isinstance(data, Image.Image):
306
+ data = Image.fromarray(data)
307
+ format_name = suffix[1:]
308
+ save_options = {"format": format_name, **options}
309
+ if "dpi" in options:
310
+ dpi = options["dpi"]
311
+ save_options["dpi"] = (dpi, dpi) if isinstance(dpi, int) else dpi
312
+ data.save(uri, **save_options)
313
+ return
314
+
315
+ # Documents
316
+ if suffix == ".pdf":
317
+ with open(uri, "wb") as f:
318
+ f.write(data if isinstance(data, bytes) else data.encode())
319
+ return
320
+
321
+ if suffix == ".html":
322
+ with open(uri, "w") as f:
323
+ f.write(data if isinstance(data, str) else data.decode())
324
+ return
325
+
326
+ # Binary
327
+ if suffix == ".pickle":
328
+ import pickle
329
+ with open(uri, "wb") as f:
330
+ pickle.dump(data, f, **options)
331
+ return
332
+
333
+ # Text/binary fallback
334
+ if isinstance(data, str):
335
+ with open(uri, "w") as f:
336
+ f.write(data)
337
+ elif isinstance(data, bytes):
338
+ with open(uri, "wb") as f:
339
+ f.write(data)
340
+ else:
341
+ raise TypeError(f"Cannot write {type(data)} to {uri}")
342
+
343
+ def copy(self, uri_from: str, uri_to: str):
344
+ """
345
+ Copy local file or directory.
346
+
347
+ Args:
348
+ uri_from: Source path
349
+ uri_to: Destination path
350
+ """
351
+ source = Path(uri_from)
352
+ dest = Path(uri_to)
353
+
354
+ if not source.exists():
355
+ raise FileNotFoundError(f"Source not found: {uri_from}")
356
+
357
+ # Ensure destination parent exists
358
+ dest.parent.mkdir(parents=True, exist_ok=True)
359
+
360
+ if source.is_file():
361
+ shutil.copy2(source, dest)
362
+ elif source.is_dir():
363
+ shutil.copytree(source, dest, dirs_exist_ok=True)
364
+ else:
365
+ raise ValueError(f"Cannot copy {source}")
366
+
367
+ def ls(self, uri: str, **options) -> list[str]:
368
+ """
369
+ List files under directory recursively.
370
+
371
+ Args:
372
+ uri: Directory path
373
+ **options: Listing options
374
+
375
+ Returns:
376
+ List of file paths
377
+ """
378
+ p = Path(uri)
379
+
380
+ if not p.exists():
381
+ return []
382
+
383
+ if p.is_file():
384
+ return [str(p)]
385
+
386
+ # Recursive glob
387
+ pattern = options.get("pattern", "**/*")
388
+ results = []
389
+ for item in p.glob(pattern):
390
+ if item.is_file():
391
+ results.append(str(item))
392
+
393
+ return sorted(results)
394
+
395
+ def ls_dirs(self, uri: str, **options) -> list[str]:
396
+ """
397
+ List immediate child directories.
398
+
399
+ Args:
400
+ uri: Directory path
401
+ **options: Listing options
402
+
403
+ Returns:
404
+ List of directory paths
405
+ """
406
+ p = Path(uri)
407
+
408
+ if not p.exists() or not p.is_dir():
409
+ return []
410
+
411
+ dirs = [str(d) for d in p.iterdir() if d.is_dir()]
412
+ return sorted(dirs)
413
+
414
+ def ls_iter(self, uri: str, **options) -> Iterator[str]:
415
+ """
416
+ Iterate over files in directory.
417
+
418
+ Args:
419
+ uri: Directory path
420
+ **options: Listing options
421
+
422
+ Yields:
423
+ File paths
424
+ """
425
+ for file_path in self.ls(uri, **options):
426
+ yield file_path
427
+
428
+ def delete(self, uri: str, limit: int = 100) -> list[str]:
429
+ """
430
+ Delete file or directory contents.
431
+
432
+ Safety limit prevents accidental bulk deletions.
433
+
434
+ Args:
435
+ uri: File or directory path
436
+ limit: Maximum files to delete
437
+
438
+ Returns:
439
+ List of deleted paths
440
+ """
441
+ p = Path(uri)
442
+
443
+ if not p.exists():
444
+ return []
445
+
446
+ deleted = []
447
+
448
+ if p.is_file():
449
+ p.unlink()
450
+ deleted.append(str(p))
451
+ elif p.is_dir():
452
+ files = self.ls(uri)
453
+ if len(files) > limit:
454
+ raise ValueError(
455
+ f"Attempting to delete {len(files)} files exceeds "
456
+ f"safety limit of {limit}. Increase limit if intentional."
457
+ )
458
+ for file_path in files:
459
+ Path(file_path).unlink()
460
+ deleted.append(file_path)
461
+ # Remove empty directories
462
+ shutil.rmtree(p, ignore_errors=True)
463
+
464
+ return deleted
465
+
466
+ def read_dataset(self, uri: str):
467
+ """
468
+ Read local data as PyArrow dataset.
469
+
470
+ Args:
471
+ uri: Dataset path (parquet, etc.)
472
+
473
+ Returns:
474
+ PyArrow Dataset
475
+ """
476
+ if not pl:
477
+ raise ImportError("Polars required for datasets. Install with: uv add polars")
478
+
479
+ return pl.read_parquet(uri).to_arrow()
480
+
481
+ def read_image(self, uri: str):
482
+ """
483
+ Read local image as PIL Image.
484
+
485
+ Args:
486
+ uri: Image file path
487
+
488
+ Returns:
489
+ PIL Image
490
+ """
491
+ if not Image:
492
+ raise ImportError("Pillow required for images. Install with: uv add pillow")
493
+
494
+ return Image.open(uri)
495
+
496
+ def apply(self, uri: str, fn: Callable[[str], Any]) -> Any:
497
+ """
498
+ Apply function to local file.
499
+
500
+ Since file is already local, just pass the path.
501
+
502
+ Args:
503
+ uri: Local file path
504
+ fn: Function that takes file path
505
+
506
+ Returns:
507
+ Result of function call
508
+ """
509
+ p = Path(uri)
510
+ if not p.exists():
511
+ raise FileNotFoundError(f"File not found: {uri}")
512
+
513
+ return fn(str(p.absolute()))
514
+
515
+ def cache_data(self, data: Any, **kwargs) -> str:
516
+ """
517
+ Cache data locally.
518
+
519
+ TODO: Implement local caching strategy.
520
+
521
+ Args:
522
+ data: Data to cache
523
+ **kwargs: Caching options
524
+
525
+ Returns:
526
+ Local file path
527
+ """
528
+ raise NotImplementedError(
529
+ "Local caching not yet implemented. "
530
+ "TODO: Implement /tmp or ~/.rem/cache strategy"
531
+ )
532
+
533
+ def local_file(self, uri: str) -> str:
534
+ """
535
+ Return local file path (already local).
536
+
537
+ Args:
538
+ uri: Local file path
539
+
540
+ Returns:
541
+ Same path
542
+ """
543
+ return uri
544
+
545
+ # ========================================================================
546
+ # Parsing Hooks
547
+ # ========================================================================
548
+ # Convention: Separate uploads/ and parsed/ directories with deterministic matching
549
+ # Uploads: ~/.rem/fs/v1/uploads/user-123/2025/01/19/file.pdf
550
+ # Parsed: ~/.rem/fs/v1/parsed/user-123/2025/01/19/file.pdf/content.md
551
+ # ~/.rem/fs/v1/parsed/user-123/2025/01/19/file.pdf/metadata.json
552
+ # ~/.rem/fs/v1/parsed/user-123/2025/01/19/file.pdf/images/page_1.png
553
+ # ========================================================================
554
+
555
+ def get_parsed_uri(self, uri: str, resource: str = "content.md") -> str:
556
+ """
557
+ Get path for parsed version of a file.
558
+
559
+ Maps uploads/ paths to parsed/ paths deterministically:
560
+ uploads/user/2025/01/19/file.pdf -> parsed/user/2025/01/19/file.pdf/{resource}
561
+
562
+ Args:
563
+ uri: Original file path (e.g., /data/v1/uploads/user/2025/01/19/file.pdf)
564
+ resource: Resource within parsed directory (default: content.md)
565
+
566
+ Returns:
567
+ Parsed resource path (e.g., /data/v1/parsed/user/2025/01/19/file.pdf/content.md)
568
+
569
+ Example:
570
+ # Original upload
571
+ upload_path = "/home/user/.rem/fs/v1/uploads/user-123/2025/01/19/report.pdf"
572
+
573
+ # Get parsed markdown
574
+ parsed_path = fs.get_parsed_uri(upload_path)
575
+ # -> /home/user/.rem/fs/v1/parsed/user-123/2025/01/19/report.pdf/content.md
576
+
577
+ # Get parse metadata
578
+ meta_path = fs.get_parsed_uri(upload_path, "metadata.json")
579
+ # -> /home/user/.rem/fs/v1/parsed/user-123/2025/01/19/report.pdf/metadata.json
580
+
581
+ # Get extracted image
582
+ img_path = fs.get_parsed_uri(upload_path, "images/page_1.png")
583
+ # -> /home/user/.rem/fs/v1/parsed/user-123/2025/01/19/report.pdf/images/page_1.png
584
+ """
585
+ from rem.settings import settings
586
+
587
+ # Use Path for clean manipulation
588
+ path = Path(uri)
589
+ path_str = str(path)
590
+
591
+ # Replace uploads/ with parsed/ in the path
592
+ uploads_prefix = settings.s3.uploads_prefix
593
+ parsed_prefix = settings.s3.parsed_prefix
594
+
595
+ if f"/{uploads_prefix}/" in path_str:
596
+ # Replace uploads/ with parsed/ in the path
597
+ new_path = path_str.replace(f"/{uploads_prefix}/", f"/{parsed_prefix}/", 1)
598
+ # Append resource to the end (filename becomes a directory)
599
+ parsed_path = f"{new_path}/{resource}"
600
+ elif path_str.startswith(f"{uploads_prefix}/"):
601
+ # Handle case without leading slash
602
+ new_path = path_str.replace(f"{uploads_prefix}/", f"{parsed_prefix}/", 1)
603
+ parsed_path = f"{new_path}/{resource}"
604
+ else:
605
+ # Fallback: append .parsed/ if not in uploads/ directory
606
+ # This handles legacy paths or custom directories
607
+ parsed_path = f"{path_str}.parsed/{resource}"
608
+
609
+ return parsed_path
610
+
611
+ def has_parsed(self, uri: str) -> bool:
612
+ """
613
+ Check if parsed version exists for a file.
614
+
615
+ Args:
616
+ uri: Original file path
617
+
618
+ Returns:
619
+ True if metadata.json exists in .parsed/ directory
620
+
621
+ Example:
622
+ if fs.has_parsed("/data/file.pdf"):
623
+ content = fs.read_parsed("/data/file.pdf")
624
+ else:
625
+ # Trigger parsing workflow
626
+ content_service.process_and_save(uri)
627
+ """
628
+ metadata_path = self.get_parsed_uri(uri, "metadata.json")
629
+ return self.exists(metadata_path)
630
+
631
+ def read_parsed(self, uri: str, resource: str = "content.md", **options) -> Any:
632
+ """
633
+ Read parsed content for a file.
634
+
635
+ Args:
636
+ uri: Original file path
637
+ resource: Resource to read (default: content.md)
638
+ **options: Format-specific read options
639
+
640
+ Returns:
641
+ Parsed content (format depends on resource)
642
+
643
+ Raises:
644
+ FileNotFoundError: If parsed version doesn't exist
645
+
646
+ Example:
647
+ # Read parsed markdown
648
+ markdown = fs.read_parsed("/data/file.pdf")
649
+
650
+ # Read parse metadata
651
+ metadata = fs.read_parsed("/data/file.pdf", "metadata.json")
652
+
653
+ # Read extracted table
654
+ table = fs.read_parsed("/data/file.pdf", "tables/table_0.parquet")
655
+ """
656
+ parsed_path = self.get_parsed_uri(uri, resource)
657
+
658
+ if not self.exists(parsed_path):
659
+ raise FileNotFoundError(
660
+ f"Parsed resource not found: {resource}. "
661
+ f"Parse file first with ContentService.process_and_save('{uri}')"
662
+ )
663
+
664
+ return self.read(parsed_path, **options)
665
+
666
+ def write_parsed(
667
+ self,
668
+ uri: str,
669
+ content: Any,
670
+ resource: str = "content.md",
671
+ metadata: dict[str, Any] | None = None,
672
+ ):
673
+ """
674
+ Write parsed content for a file.
675
+
676
+ Automatically writes metadata.json with parse info if provided.
677
+
678
+ Args:
679
+ uri: Original file path
680
+ content: Parsed content to write
681
+ resource: Resource name (default: content.md)
682
+ metadata: Optional parse metadata (provider, timestamp, etc.)
683
+
684
+ Example:
685
+ # Write parsed markdown
686
+ fs.write_parsed(
687
+ "/data/file.pdf",
688
+ markdown_content,
689
+ metadata={
690
+ "provider": "kreuzberg",
691
+ "timestamp": datetime.now().isoformat(),
692
+ "page_count": 10,
693
+ }
694
+ )
695
+
696
+ # Write extracted image
697
+ fs.write_parsed(
698
+ "/data/file.pdf",
699
+ image_data,
700
+ resource="images/page_1.png"
701
+ )
702
+
703
+ # Write extracted table
704
+ fs.write_parsed(
705
+ "/data/file.pdf",
706
+ table_df,
707
+ resource="tables/table_0.parquet"
708
+ )
709
+ """
710
+ # Write primary content
711
+ parsed_path = self.get_parsed_uri(uri, resource)
712
+ self.write(parsed_path, content)
713
+
714
+ # Write metadata if provided
715
+ if metadata is not None:
716
+ # Add standard fields if not present
717
+ if "timestamp" not in metadata:
718
+ metadata["timestamp"] = datetime.now().isoformat()
719
+ if "source_uri" not in metadata:
720
+ metadata["source_uri"] = uri
721
+
722
+ metadata_path = self.get_parsed_uri(uri, "metadata.json")
723
+ self.write(metadata_path, metadata)
724
+
725
+ def list_parsed_resources(self, uri: str) -> list[str]:
726
+ """
727
+ List all resources in parsed directory.
728
+
729
+ Args:
730
+ uri: Original file path (upload path)
731
+
732
+ Returns:
733
+ List of resource paths (relative to parsed file directory)
734
+
735
+ Example:
736
+ upload_path = "/home/user/.rem/fs/v1/uploads/user-123/2025/01/19/report.pdf"
737
+ resources = fs.list_parsed_resources(upload_path)
738
+ # Returns: ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']
739
+
740
+ # Read all resources
741
+ for resource in resources:
742
+ data = fs.read_parsed(upload_path, resource)
743
+ """
744
+ # Get the parsed directory path (without specific resource)
745
+ parsed_base = self.get_parsed_uri(uri, "")
746
+ # Remove trailing slash for consistent listing
747
+ parsed_base = parsed_base.rstrip("/")
748
+
749
+ # List all files under the parsed directory
750
+ all_paths = self.ls(parsed_base)
751
+
752
+ # Extract relative paths from the parsed base
753
+ resources = []
754
+ for full_path in all_paths:
755
+ # Remove the parsed base prefix to get relative path
756
+ if full_path.startswith(parsed_base + "/"):
757
+ relative = full_path[len(parsed_base) + 1:] # +1 for the /
758
+ resources.append(relative)
759
+
760
+ return resources