remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,662 @@
1
+ # REM File System Service
2
+
3
+ Unified file system abstraction for S3 and local storage with format detection and Polars integration.
4
+
5
+ ## Features
6
+
7
+ - **Unified Interface**: Seamless operations across S3 and local filesystems
8
+ - **Format Detection**: Automatic reader/writer selection based on file extensions
9
+ - **Polars First**: Columnar data operations using Polars (with Pandas fallback)
10
+ - **Presigned URLs**: Generate S3 presigned URLs for direct access
11
+ - **ContentService Integration**: Pluggable content providers for specialized formats
12
+ - **Type Safety**: Full Pydantic validation for S3 metadata
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ # Core dependencies (already in main dependencies)
18
+ uv add boto3 pyyaml
19
+
20
+ # File system extras
21
+ uv add --optional fs polars pillow
22
+
23
+ # Or install individually
24
+ uv add polars pillow
25
+ ```
26
+
27
+ ## Quick Start
28
+
29
+ ```python
30
+ from rem.services.fs import FS, generate_presigned_url
31
+
32
+ fs = FS()
33
+
34
+ # Read from S3 or local - same interface
35
+ df = fs.read("s3://bucket/data.parquet")
36
+ df = fs.read("/local/path/data.csv", use_polars=True)
37
+
38
+ # Write with automatic format detection
39
+ fs.write("s3://bucket/output.json", {"key": "value"})
40
+ fs.write("/tmp/data.parquet", dataframe)
41
+
42
+ # Copy between filesystems
43
+ fs.copy("s3://bucket/file.pdf", "/tmp/local.pdf") # Download
44
+ fs.copy("/local/image.png", "s3://bucket/image.png") # Upload
45
+
46
+ # List files
47
+ files = fs.ls("s3://bucket/prefix/")
48
+ dirs = fs.ls_dirs("s3://bucket/")
49
+
50
+ # Generate presigned URLs
51
+ url = generate_presigned_url("s3://bucket/file.pdf", expiry=3600)
52
+ upload_url = generate_presigned_url("s3://bucket/new.pdf", for_upload=True)
53
+ ```
54
+
55
+ ## Supported Formats
56
+
57
+ ### Columnar Data (Polars/Pandas)
58
+ - **CSV** (`.csv`) - `pl.read_csv()` / `pl.write_csv()`
59
+ - **Parquet** (`.parquet`) - `pl.read_parquet()` / `pl.write_parquet()`
60
+ - **Feather** (`.feather`) - `pl.read_feather()` / `pl.write_feather()`
61
+
62
+ ### Structured Data
63
+ - **JSON** (`.json`) - Python dict serialization
64
+ - **YAML** (`.yml`, `.yaml`) - PyYAML integration
65
+
66
+ ### Documents
67
+ - **Text** (`.txt`, `.md`, `.log`) - UTF-8 text
68
+ - **PDF** (`.pdf`) - TODO: ContentService integration
69
+ - **DOCX** (`.docx`) - TODO: python-docx provider
70
+ - **HTML** (`.html`) - Raw HTML read/write
71
+
72
+ ### Images (Pillow)
73
+ - **PNG** (`.png`)
74
+ - **JPEG** (`.jpg`, `.jpeg`)
75
+ - **TIFF** (`.tiff`, `.tif`)
76
+ - **SVG** (`.svg`) - Read as text
77
+
78
+ ### Spreadsheets
79
+ - **Excel** (`.xlsx`, `.xls`) - TODO: Add `openpyxl`/`xlrd` to dependencies
80
+
81
+ ### Audio
82
+ - **WAV** (`.wav`) - TODO: Add `librosa` or `pydub` provider
83
+ - **MP3** (`.mp3`) - TODO: Audio processing library
84
+ - **FLAC** (`.flac`) - TODO: Audio processing library
85
+
86
+ ### Binary
87
+ - **Pickle** (`.pickle`) - Python pickle serialization
88
+
89
+ ## Configuration
90
+
91
+ Uses REM settings from `.env`:
92
+
93
+ ```bash
94
+ # S3 Settings (rem/settings.py -> S3Settings)
95
+ S3__BUCKET_NAME=rem-storage
96
+ S3__REGION=us-east-1
97
+
98
+ # For local dev (MinIO)
99
+ S3__ENDPOINT_URL=http://localhost:9000
100
+ S3__ACCESS_KEY_ID=minioadmin
101
+ S3__SECRET_ACCESS_KEY=minioadmin
102
+ S3__USE_SSL=false
103
+
104
+ # For production (IRSA in EKS)
105
+ # No access keys needed - uses IAM role
106
+ ```
107
+
108
+ ## Architecture
109
+
110
+ ```
111
+ FS (facade)
112
+ ├── S3Provider
113
+ │ ├── boto3 client (from settings)
114
+ │ ├── Format detection
115
+ │ ├── Presigned URLs
116
+ │ └── Multipart uploads (TODO)
117
+ └── LocalProvider
118
+ ├── pathlib operations
119
+ ├── Format detection
120
+ └── Same interface as S3Provider
121
+ ```
122
+
123
+ ## Design Principles
124
+
125
+ 1. **No upload/download methods** - Use `copy(from, to)` instead
126
+ 2. **No zip/unzip methods** - Archives (`.zip`, `.tar.gz`) are meant to be handled through `copy()`; this is not implemented yet (see "TODO: Future Enhancements")
127
+ 3. **Extension-based format detection** - Automatic reader/writer selection
128
+ 4. **DRY** - Shared format handling between S3 and local
129
+ 5. **Lean implementation** - Stubs/TODOs for heavy dependencies
130
+
131
+ ## API Reference
132
+
133
+ ### Core Operations
134
+
135
+ #### `fs.read(uri, use_polars=True, **options) -> Any`
136
+ Read file with automatic format detection.
137
+
138
+ ```python
139
+ # Columnar data (returns Polars DataFrame by default)
140
+ df = fs.read("s3://bucket/data.csv")
141
+ df = fs.read("s3://bucket/data.parquet", use_polars=False) # Pandas
142
+
143
+ # Structured data
144
+ config = fs.read("s3://bucket/config.yaml")
145
+ data = fs.read("s3://bucket/data.json")
146
+
147
+ # Images
148
+ img = fs.read("s3://bucket/image.png") # PIL Image
149
+
150
+ # Text
151
+ content = fs.read("s3://bucket/readme.md")
152
+ ```
153
+
154
+ #### `fs.write(uri, data, **options)`
155
+ Write file with automatic format detection.
156
+
157
+ ```python
158
+ # Columnar data
159
+ fs.write("s3://bucket/output.csv", polars_df)
160
+ fs.write("s3://bucket/output.parquet", pandas_df)
161
+
162
+ # Structured data
163
+ fs.write("s3://bucket/config.yaml", {"key": "value"})
164
+ fs.write("s3://bucket/data.json", {"data": [1, 2, 3]})
165
+
166
+ # Images
167
+ fs.write("s3://bucket/image.png", pil_image, dpi=300)
168
+
169
+ # Text
170
+ fs.write("s3://bucket/output.txt", "Hello, world!")
171
+ ```
172
+
173
+ #### `fs.copy(uri_from, uri_to)`
174
+ Copy between filesystems.
175
+
176
+ ```python
177
+ # S3 to S3
178
+ fs.copy("s3://bucket1/file.csv", "s3://bucket2/file.csv")
179
+
180
+ # Download
181
+ fs.copy("s3://bucket/file.pdf", "/tmp/file.pdf")
182
+
183
+ # Upload
184
+ fs.copy("/local/file.png", "s3://bucket/images/file.png")
185
+
186
+ # Local to local
187
+ fs.copy("/src/file.txt", "/dst/file.txt")
188
+ ```
189
+
190
+ #### `fs.ls(uri, **options) -> list[str]`
191
+ List files recursively.
192
+
193
+ ```python
194
+ # S3
195
+ files = fs.ls("s3://bucket/prefix/")
196
+ # [
197
+ # "s3://bucket/prefix/file1.csv",
198
+ # "s3://bucket/prefix/subdir/file2.json",
199
+ # ]
200
+
201
+ # Local
202
+ files = fs.ls("/path/to/dir/")
203
+ ```
204
+
205
+ #### `fs.ls_dirs(uri, **options) -> list[str]`
206
+ List immediate child directories.
207
+
208
+ ```python
209
+ dirs = fs.ls_dirs("s3://bucket/")
210
+ # [
211
+ # "s3://bucket/data",
212
+ # "s3://bucket/models",
213
+ # ]
214
+ ```
215
+
216
+ #### `fs.exists(uri) -> bool`
217
+ Check if file/directory exists.
218
+
219
+ ```python
220
+ if fs.exists("s3://bucket/file.csv"):
221
+ df = fs.read("s3://bucket/file.csv")
222
+ ```
223
+
224
+ #### `fs.delete(uri, limit=100) -> list[str]`
225
+ Delete file or directory contents (with safety limit).
226
+
227
+ ```python
228
+ deleted = fs.delete("s3://bucket/old_data/", limit=50)
229
+ ```
230
+
231
+ ### Advanced Operations
232
+
233
+ #### `fs.read_dataset(uri) -> pyarrow.Dataset`
234
+ Read as PyArrow dataset for lazy loading.
235
+
236
+ ```python
237
+ dataset = fs.read_dataset("s3://bucket/partitioned.parquet")
238
+ ```
239
+
240
+ #### `fs.read_image(uri) -> PIL.Image`
241
+ Read image explicitly.
242
+
243
+ ```python
244
+ img = fs.read_image("s3://bucket/photo.jpg")
245
+ img.show()
246
+ ```
247
+
248
+ #### `fs.apply(uri, fn) -> Any`
249
+ Apply a function to a local copy of the file (S3 objects are first downloaded to /tmp).
250
+
251
+ ```python
252
+ def process_image(path):
253
+ from PIL import Image
254
+ img = Image.open(path)
255
+ return img.size
256
+
257
+ width, height = fs.apply("s3://bucket/image.png", process_image)
258
+ ```
259
+
260
+ #### `fs.local_file(uri) -> str`
261
+ Get local path (downloads from S3 if needed).
262
+
263
+ ```python
264
+ local_path = fs.local_file("s3://bucket/model.pkl")
265
+ # Returns: "/tmp/model.pkl"
266
+ ```
267
+
268
+ #### `generate_presigned_url(url, expiry=3600, for_upload=False) -> str`
269
+ Generate S3 presigned URL.
270
+
271
+ ```python
272
+ # Download URL (expires in 1 hour)
273
+ download_url = generate_presigned_url("s3://bucket/file.pdf")
274
+
275
+ # Upload URL
276
+ upload_url = generate_presigned_url(
277
+ "s3://bucket/upload.pdf",
278
+ expiry=300, # 5 minutes
279
+ for_upload=True
280
+ )
281
+ ```
282
+
283
+ ## ContentService Integration
284
+
285
+ For specialized document parsing (PDF, DOCX, etc.), use `ContentService`:
286
+
287
+ ```python
288
+ from rem.services.content import ContentService
289
+
290
+ content_service = ContentService()
291
+
292
+ # Process PDF with OCR, layout detection, etc.
293
+ result = content_service.process_uri("s3://bucket/document.pdf")
294
+ # {
295
+ # "uri": "s3://bucket/document.pdf",
296
+ # "content": "Extracted text...",
297
+ # "metadata": {...},
298
+ # "provider": "pdf"
299
+ # }
300
+ ```
301
+
302
+ The `ContentService` provides pluggable providers for complex formats that require specialized parsing.
303
+
304
+ ## Parsing Hooks
305
+
306
+ Parsing hooks store the parsed versions of a file separately from the original upload. When you upload a PDF, what you usually need downstream is the structured markdown plus any extracted images and tables. The FS provider maps each upload path to its parsed-content directory deterministically.
307
+
308
+ ### Convention
309
+
310
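+ Separate `uploads/` and `parsed/` directories with deterministic path mapping (note: the example paths below omit the `rem/` namespace segment described under "Path Conventions"):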
311
+
312
+ ```
313
+ # S3 paths
314
+ s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf # Original
315
+ s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/ # Parsed directory
316
+ ├── content.md # Primary content
317
+ ├── metadata.json # Parse metadata
318
+ ├── images/page_1.png # Extracted images
319
+ └── tables/table_0.parquet # Extracted tables
320
+
321
+ # Local paths
322
+ ~/.rem/fs/v1/uploads/user-123/2025/01/19/report.pdf # Original
323
+ ~/.rem/fs/v1/parsed/user-123/2025/01/19/report.pdf/ # Parsed directory
324
+ ├── content.md
325
+ ├── metadata.json
326
+ ├── images/page_1.png
327
+ └── tables/table_0.parquet
328
+ ```
329
+
330
+ ### Configuration
331
+
332
+ Control paths via environment variables:
333
+
334
+ ```bash
335
+ # S3 Settings
336
+ S3__BUCKET_NAME=rem-io-staging
337
+ S3__VERSION=v1
338
+ S3__UPLOADS_PREFIX=uploads
339
+ S3__PARSED_PREFIX=parsed
340
+ ```
341
+
342
+ ### Basic Usage
343
+
344
+ ```python
345
+ from rem.services.fs import FS
346
+
347
+ fs = FS()
348
+ upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"
349
+
350
+ # Check if already parsed
351
+ if fs.has_parsed(upload_uri):
352
+ markdown = fs.read_parsed(upload_uri)
353
+ else:
354
+ # Parse and cache
355
+ result = parse_file(upload_uri)
356
+ fs.write_parsed(
357
+ upload_uri,
358
+ result.markdown,
359
+ metadata={"provider": "kreuzberg", "page_count": 10}
360
+ )
361
+ # Writes to: s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/content.md
362
+
363
+ # List all parsed resources
364
+ resources = fs.list_parsed_resources(upload_uri)
365
+ # ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']
366
+
367
+ # Read specific resources
368
+ metadata = fs.read_parsed(upload_uri, "metadata.json")
369
+ image = fs.read_parsed(upload_uri, "images/page_1.png")
370
+ table = fs.read_parsed(upload_uri, "tables/table_0.parquet")
371
+ ```
372
+
373
+ ### Benefits
374
+
375
+ - **Separation of concerns**: Uploads and parsed content in separate directories
376
+ - **Deterministic mapping**: uploads/user/date/file.pdf -> parsed/user/date/file.pdf/
377
+ - **Caching**: Check `has_parsed()` before re-parsing expensive files
378
+ - **Discoverability**: `list_parsed_resources()` shows what's available
379
+ - **Flexibility**: Store markdown, images, tables, any extracted content
380
+ - **Scalable**: Clean separation works across S3 and local filesystems
381
+
382
+ See `parsing-hooks-examples.md` for more detailed examples.
383
+
384
+ ## TODO: Future Enhancements
385
+
386
+ ### High Priority
387
+ - [ ] **ContentService integration** for PDF parsing in `read()`
388
+ - [ ] **Multipart uploads** for large S3 files (>5GB)
389
+ - [ ] **Progress bars** for large uploads/downloads (tqdm)
390
+ - [ ] **Pagination** in `ls_iter()` for massive directories
391
+
392
+ ### Medium Priority
393
+ - [ ] **python-docx provider** for `.docx` files
394
+ - [ ] **Audio providers** (librosa/pydub) for `.wav`, `.mp3`, `.flac`
395
+ - [ ] **Excel dependencies** (openpyxl/xlrd) for full `.xlsx`/`.xls` support
396
+ - [ ] **Archive operations** (`.zip`, `.tar.gz`) via copy interface
397
+ - [ ] **S3 versioning** support in all operations
398
+
399
+ ### Low Priority
400
+ - [ ] **Local caching** strategy for `LocalProvider.cache_data()`
401
+ - [ ] **SVG to PIL** conversion for image operations
402
+ - [ ] **Video formats** (`.mp4`, `.avi`) via opencv or ffmpeg
403
+ - [ ] **Compression** options (gzip, brotli) for text formats
404
+
405
+ ## Testing
406
+
407
+ ```python
408
+ # Test basic operations
409
+ from rem.services.fs import FS
410
+
411
+ fs = FS()
412
+
413
+ # Write and read
414
+ fs.write("/tmp/test.json", {"test": "data"})
415
+ data = fs.read("/tmp/test.json")
416
+ assert data == {"test": "data"}
417
+
418
+ # S3 operations (requires a configured bucket; `df` must be an existing Polars/Pandas DataFrame)
419
+ fs.write("s3://test-bucket/data.csv", df)
420
+ df2 = fs.read("s3://test-bucket/data.csv")
421
+ ```
422
+
423
+ ## Contributing
424
+
425
+ When adding new format support:
426
+
427
+ 1. Add reader logic to both `S3Provider.read()` and `LocalProvider.read()`
428
+ 2. Add writer logic to both `S3Provider.write()` and `LocalProvider.write()`
429
+ 3. Add the optional dependency to `pyproject.toml` with an explanatory comment
430
+ 4. Add format documentation to this README
431
+ 5. Consider ContentService for complex formats (PDF, DOCX, etc.)
432
+
433
+ ## Path Conventions
434
+
435
+ REM uses standardized path conventions for consistent file organization across local and S3 storage.
436
+
437
+ ### Path Structure
438
+
439
+ ```
440
+ {base_uri}/rem/{version}/{category}/{scope}/{date_parts}/
441
+ ```
442
+
443
+ **Base URI:**
444
+ - **Local**: `$REM_HOME/fs/` (defaults to `~/.rem/fs`)
445
+ - **S3**: `s3://{bucket}/` (from settings)
446
+ - **Auto-detection**: Uses S3 in production, local in development
447
+
448
+ **Components:**
449
+
450
+ | Component | Description | Example |
451
+ |-----------|-------------|---------|
452
+ | `base_uri` | Storage location | `s3://rem-bucket` or `/Users/user/.rem/fs` |
453
+ | `rem` | Namespace | `rem` |
454
+ | `version` | API version | `v1`, `v2` |
455
+ | `category` | Resource type | `uploads`, `schemas`, `users`, `temp` |
456
+ | `scope` | User or system | `system`, `user-123` |
457
+ | `date_parts` | Date hierarchy | `2025/01/19` or `2025/01/19/14_30` |
458
+
459
+ ### Upload Paths
460
+
461
+ Standard structure for file uploads with date-based partitioning:
462
+
463
+ ```python
464
+ from rem.services.fs import get_uploads_path, FS
465
+
466
+ # System uploads (no user)
467
+ path = get_uploads_path()
468
+ # /Users/user/.rem/fs/rem/v1/uploads/system/2025/01/19
469
+
470
+ # User-specific uploads
471
+ path = get_uploads_path(user_id="user-123")
472
+ # /Users/user/.rem/fs/rem/v1/uploads/user-123/2025/01/19
473
+
474
+ # With specific date
475
+ from datetime import date
476
+ path = get_uploads_path(user_id="user-456", dt=date(2025, 1, 15))
477
+ # /Users/user/.rem/fs/rem/v1/uploads/user-456/2025/01/15
478
+
479
+ # Include hour/minute for high-frequency uploads
480
+ from datetime import datetime
481
+ path = get_uploads_path(user_id="user-789", dt=datetime.now(), include_time=True)
482
+ # /Users/user/.rem/fs/rem/v1/uploads/user-789/2025/01/19/14_30
483
+
484
+ # Force S3
485
+ path = get_uploads_path(user_id="user-123", use_s3=True)
486
+ # s3://rem-bucket/rem/v1/uploads/user-123/2025/01/19
487
+
488
+ # Use with FS
489
+ fs = FS()
490
+ upload_dir = get_uploads_path(user_id="user-123")
491
+ fs.write(f"{upload_dir}/data.json", {"key": "value"})
492
+ ```
493
+
494
+ ### Versioned Resource Paths
495
+
496
+ For schemas, agents, tools, and datasets:
497
+
498
+ ```python
499
+ from rem.services.fs import get_versioned_path
500
+
501
+ # Schemas
502
+ path = get_versioned_path("schemas", "user-schema")
503
+ # /Users/user/.rem/fs/rem/v1/schemas/user-schema
504
+
505
+ # Agents (with version)
506
+ path = get_versioned_path("agents", "query-agent", version="v2")
507
+ # /Users/user/.rem/fs/rem/v2/agents/query-agent
508
+
509
+ # Tools
510
+ path = get_versioned_path("tools", "web-scraper")
511
+ # /Users/user/.rem/fs/rem/v1/tools/web-scraper
512
+
513
+ # Datasets
514
+ path = get_versioned_path("datasets", "training-data")
515
+ # /Users/user/.rem/fs/rem/v1/datasets/training-data
516
+ ```
517
+
518
+ ### User-Scoped Paths
519
+
520
+ For user-specific storage:
521
+
522
+ ```python
523
+ from rem.services.fs import get_user_path
524
+
525
+ # User root
526
+ path = get_user_path("user-123")
527
+ # /Users/user/.rem/fs/rem/v1/users/user-123
528
+
529
+ # User documents
530
+ path = get_user_path("user-123", "documents")
531
+ # /Users/user/.rem/fs/rem/v1/users/user-123/documents
532
+
533
+ # User images
534
+ path = get_user_path("user-456", "images")
535
+ # /Users/user/.rem/fs/rem/v1/users/user-456/images
536
+ ```
537
+
538
+ ### Temporary Paths
539
+
540
+ For temporary file processing with timestamps:
541
+
542
+ ```python
543
+ from rem.services.fs import get_temp_path
544
+
545
+ # Default temp
546
+ path = get_temp_path()
547
+ # /Users/user/.rem/fs/rem/v1/temp/tmp/20250119_143045
548
+
549
+ # Processing temp
550
+ path = get_temp_path("processing")
551
+ # /Users/user/.rem/fs/rem/v1/temp/processing/20250119_143045
552
+
553
+ # Conversion temp
554
+ path = get_temp_path("conversion")
555
+ # /Users/user/.rem/fs/rem/v1/temp/conversion/20250119_143045
556
+ ```
557
+
558
+ ### Path Utilities
559
+
560
+ ```python
561
+ from rem.services.fs import (
562
+ get_base_uri,
563
+ get_rem_home,
564
+ ensure_dir_exists,
565
+ join_path
566
+ )
567
+
568
+ # Get base URI (auto-detect based on environment)
569
+ base = get_base_uri()
570
+
571
+ # Force local or S3
572
+ base = get_base_uri(use_s3=False) # /Users/user/.rem/fs
573
+ base = get_base_uri(use_s3=True) # s3://rem-bucket
574
+
575
+ # Get REM_HOME directory
576
+ home = get_rem_home() # /Users/user/.rem
577
+
578
+ # Ensure directory exists (local only, no-op for S3)
579
+ path = ensure_dir_exists("/path/to/dir")
580
+
581
+ # Join paths (auto-detects S3 vs local)
582
+ path = join_path("s3://bucket", "rem", "v1", "uploads")
583
+ # s3://bucket/rem/v1/uploads
584
+
585
+ path = join_path("/home/user", "rem", "data")
586
+ # /home/user/rem/data
587
+ ```
588
+
589
+ ### Best Practices
590
+
591
+ 1. **Always use path functions** - Don't hardcode paths
592
+ ```python
593
+ # ✅ Good
594
+ from rem.services.fs import get_uploads_path
595
+ path = get_uploads_path(user_id="user-123")
596
+
597
+ # ❌ Bad
598
+ path = "/Users/user/.rem/fs/rem/v1/uploads/user-123/2025/01/19"
599
+ ```
600
+
601
+ 2. **Trust auto-detection** - Let environment determine S3 vs local
602
+ ```python
603
+ # ✅ Good - auto-detects based on ENVIRONMENT
604
+ path = get_uploads_path(user_id="user-123")
605
+
606
+ # ❌ Unnecessary - only force when you have a specific reason
607
+ path = get_uploads_path(user_id="user-123", use_s3=False)
608
+ ```
609
+
610
+ 3. **Use date partitioning** - Leverage hierarchy for scalability
611
+ ```python
612
+ # ✅ Good - partitioned by date
613
+ path = get_uploads_path(user_id="user-123", dt=datetime.now())
614
+
615
+ # ✅ Also good - include time for high-frequency uploads
616
+ path = get_uploads_path(user_id="user-123", include_time=True)
617
+ ```
618
+
619
+ 4. **User vs system scope** - Pass `user_id` for user files; omit it for system files
620
+ ```python
621
+ # User files
622
+ user_upload = get_uploads_path(user_id="user-123")
623
+
624
+ # System files (logs, configs, etc.)
625
+ system_upload = get_uploads_path() # Uses "system"
626
+ ```
627
+
628
+ 5. **Ensure directories exist** - For local paths before writing
629
+ ```python
630
+ from rem.services.fs import get_uploads_path, ensure_dir_exists, FS
631
+
632
+ path = get_uploads_path(user_id="user-123")
633
+ ensure_dir_exists(path) # No-op for S3
634
+
635
+ fs = FS()
636
+ fs.write(f"{path}/data.json", data)
637
+ ```
638
+
639
+ ### Path Reference
640
+
641
+ Quick reference for all path types:
642
+
643
+ | Function | Path Structure | Example |
644
+ |----------|----------------|---------|
645
+ | `get_uploads_path()` | `rem/v1/uploads/{system\|user_id}/{yyyy}/{mm}/{dd}[/{hh_mm}]` | `rem/v1/uploads/user-123/2025/01/19` |
646
+ | `get_versioned_path()` | `rem/{version}/{resource_type}/{name}` | `rem/v1/schemas/user-schema` |
647
+ | `get_user_path()` | `rem/v1/users/{user_id}[/{subpath}]` | `rem/v1/users/user-123/documents` |
648
+ | `get_temp_path()` | `rem/v1/temp/{prefix}/{timestamp}` | `rem/v1/temp/processing/20250119_143045` |
649
+
650
+ ### Examples
651
+
652
+ See `rem/src/rem/services/fs/examples_paths.py` for complete working examples:
653
+
654
+ ```bash
655
+ python -m rem.services.fs.examples_paths
656
+ ```
657
+
658
+ ## See Also
659
+
660
+ - ContentService: `rem/src/rem/services/content/` - Specialized parsing (PDF, DOCX, etc.)
661
+ - Settings: `rem/settings.py` - S3Settings, REM_HOME configuration
662
+ - Examples: `rem/src/rem/services/fs/examples_paths.py` - Path convention examples