remdb 0.3.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
# REM File System Service
|
|
2
|
+
|
|
3
|
+
Unified file system abstraction for S3 and local storage with format detection and Polars integration.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Unified Interface**: Seamless operations across S3 and local filesystems
|
|
8
|
+
- **Format Detection**: Automatic reader/writer selection based on file extensions
|
|
9
|
+
- **Polars First**: Columnar data operations using Polars (with Pandas fallback)
|
|
10
|
+
- **Presigned URLs**: Generate S3 presigned URLs for direct access
|
|
11
|
+
- **ContentService Integration**: Pluggable content providers for specialized formats
|
|
12
|
+
- **Type Safety**: Full Pydantic validation for S3 metadata
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# Core dependencies (already in main dependencies)
|
|
18
|
+
uv add boto3 pyyaml
|
|
19
|
+
|
|
20
|
+
# File system extras
|
|
21
|
+
uv add --optional fs polars pillow
|
|
22
|
+
|
|
23
|
+
# Or install individually
|
|
24
|
+
uv add polars pillow
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from rem.services.fs import FS, generate_presigned_url
|
|
31
|
+
|
|
32
|
+
fs = FS()
|
|
33
|
+
|
|
34
|
+
# Read from S3 or local - same interface
|
|
35
|
+
df = fs.read("s3://bucket/data.parquet")
|
|
36
|
+
df = fs.read("/local/path/data.csv", use_polars=True)
|
|
37
|
+
|
|
38
|
+
# Write with automatic format detection
|
|
39
|
+
fs.write("s3://bucket/output.json", {"key": "value"})
|
|
40
|
+
fs.write("/tmp/data.parquet", dataframe)
|
|
41
|
+
|
|
42
|
+
# Copy between filesystems
|
|
43
|
+
fs.copy("s3://bucket/file.pdf", "/tmp/local.pdf") # Download
|
|
44
|
+
fs.copy("/local/image.png", "s3://bucket/image.png") # Upload
|
|
45
|
+
|
|
46
|
+
# List files
|
|
47
|
+
files = fs.ls("s3://bucket/prefix/")
|
|
48
|
+
dirs = fs.ls_dirs("s3://bucket/")
|
|
49
|
+
|
|
50
|
+
# Generate presigned URLs
|
|
51
|
+
url = generate_presigned_url("s3://bucket/file.pdf", expiry=3600)
|
|
52
|
+
upload_url = generate_presigned_url("s3://bucket/new.pdf", for_upload=True)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Supported Formats
|
|
56
|
+
|
|
57
|
+
### Columnar Data (Polars/Pandas)
|
|
58
|
+
- **CSV** (`.csv`) - `pl.read_csv()` / `DataFrame.write_csv()`
|
|
59
|
+
- **Parquet** (`.parquet`) - `pl.read_parquet()` / `DataFrame.write_parquet()`
|
|
60
|
+
- **Feather** (`.feather`) - `pl.read_ipc()` / `DataFrame.write_ipc()` (Feather v2 is the Arrow IPC file format)
|
|
61
|
+
|
|
62
|
+
### Structured Data
|
|
63
|
+
- **JSON** (`.json`) - Python dict serialization
|
|
64
|
+
- **YAML** (`.yml`, `.yaml`) - PyYAML integration
|
|
65
|
+
|
|
66
|
+
### Documents
|
|
67
|
+
- **Text** (`.txt`, `.md`, `.log`) - UTF-8 text
|
|
68
|
+
- **PDF** (`.pdf`) - TODO: ContentService integration
|
|
69
|
+
- **DOCX** (`.docx`) - TODO: python-docx provider
|
|
70
|
+
- **HTML** (`.html`) - Raw HTML read/write
|
|
71
|
+
|
|
72
|
+
### Images (Pillow)
|
|
73
|
+
- **PNG** (`.png`)
|
|
74
|
+
- **JPEG** (`.jpg`, `.jpeg`)
|
|
75
|
+
- **TIFF** (`.tiff`, `.tif`)
|
|
76
|
+
- **SVG** (`.svg`) - Read as text
|
|
77
|
+
|
|
78
|
+
### Spreadsheets
|
|
79
|
+
- **Excel** (`.xlsx`, `.xls`) - TODO: Add `openpyxl`/`xlrd` to dependencies
|
|
80
|
+
|
|
81
|
+
### Audio
|
|
82
|
+
- **WAV** (`.wav`) - TODO: Add `librosa` or `pydub` provider
|
|
83
|
+
- **MP3** (`.mp3`) - TODO: Audio processing library
|
|
84
|
+
- **FLAC** (`.flac`) - TODO: Audio processing library
|
|
85
|
+
|
|
86
|
+
### Binary
|
|
87
|
+
- **Pickle** (`.pickle`) - Python pickle serialization
|
|
88
|
+
|
|
89
|
+
## Configuration
|
|
90
|
+
|
|
91
|
+
Uses REM settings from `.env`:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# S3 Settings (rem/settings.py -> S3Settings)
|
|
95
|
+
S3__BUCKET_NAME=rem-storage
|
|
96
|
+
S3__REGION=us-east-1
|
|
97
|
+
|
|
98
|
+
# For local dev (MinIO)
|
|
99
|
+
S3__ENDPOINT_URL=http://localhost:9000
|
|
100
|
+
S3__ACCESS_KEY_ID=minioadmin
|
|
101
|
+
S3__SECRET_ACCESS_KEY=minioadmin
|
|
102
|
+
S3__USE_SSL=false
|
|
103
|
+
|
|
104
|
+
# For production (IRSA - IAM Roles for Service Accounts - in EKS)
|
|
105
|
+
# No access keys needed - uses IAM role
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Architecture
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
FS (facade)
|
|
112
|
+
├── S3Provider
|
|
113
|
+
│ ├── boto3 client (from settings)
|
|
114
|
+
│ ├── Format detection
|
|
115
|
+
│ ├── Presigned URLs
|
|
116
|
+
│ └── Multipart uploads (TODO)
|
|
117
|
+
└── LocalProvider
|
|
118
|
+
    ├── pathlib operations
|
|
119
|
+
    ├── Format detection
|
|
120
|
+
    └── Same interface as S3Provider
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Design Principles
|
|
124
|
+
|
|
125
|
+
1. **No upload/download methods** - Use `copy(from, to)` instead
|
|
126
|
+
2. **No zip/unzip methods** - Archives are meant to be handled through `copy()` (archive support is not implemented yet; see the TODO list below)
|
|
127
|
+
3. **Extension-based format detection** - Automatic reader/writer selection
|
|
128
|
+
4. **DRY** - Shared format handling between S3 and local
|
|
129
|
+
5. **Lean implementation** - Stubs/TODOs for heavy dependencies
|
|
130
|
+
|
|
131
|
+
## API Reference
|
|
132
|
+
|
|
133
|
+
### Core Operations
|
|
134
|
+
|
|
135
|
+
#### `fs.read(uri, use_polars=True, **options) -> Any`
|
|
136
|
+
Read file with automatic format detection.
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
# Columnar data (returns Polars DataFrame by default)
|
|
140
|
+
df = fs.read("s3://bucket/data.csv")
|
|
141
|
+
df = fs.read("s3://bucket/data.parquet", use_polars=False) # Pandas
|
|
142
|
+
|
|
143
|
+
# Structured data
|
|
144
|
+
config = fs.read("s3://bucket/config.yaml")
|
|
145
|
+
data = fs.read("s3://bucket/data.json")
|
|
146
|
+
|
|
147
|
+
# Images
|
|
148
|
+
img = fs.read("s3://bucket/image.png") # PIL Image
|
|
149
|
+
|
|
150
|
+
# Text
|
|
151
|
+
content = fs.read("s3://bucket/readme.md")
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
#### `fs.write(uri, data, **options)`
|
|
155
|
+
Write file with automatic format detection.
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
# Columnar data
|
|
159
|
+
fs.write("s3://bucket/output.csv", polars_df)
|
|
160
|
+
fs.write("s3://bucket/output.parquet", pandas_df)
|
|
161
|
+
|
|
162
|
+
# Structured data
|
|
163
|
+
fs.write("s3://bucket/config.yaml", {"key": "value"})
|
|
164
|
+
fs.write("s3://bucket/data.json", {"data": [1, 2, 3]})
|
|
165
|
+
|
|
166
|
+
# Images
|
|
167
|
+
fs.write("s3://bucket/image.png", pil_image, dpi=300)
|
|
168
|
+
|
|
169
|
+
# Text
|
|
170
|
+
fs.write("s3://bucket/output.txt", "Hello, world!")
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
#### `fs.copy(uri_from, uri_to)`
|
|
174
|
+
Copy between filesystems.
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
# S3 to S3
|
|
178
|
+
fs.copy("s3://bucket1/file.csv", "s3://bucket2/file.csv")
|
|
179
|
+
|
|
180
|
+
# Download
|
|
181
|
+
fs.copy("s3://bucket/file.pdf", "/tmp/file.pdf")
|
|
182
|
+
|
|
183
|
+
# Upload
|
|
184
|
+
fs.copy("/local/file.png", "s3://bucket/images/file.png")
|
|
185
|
+
|
|
186
|
+
# Local to local
|
|
187
|
+
fs.copy("/src/file.txt", "/dst/file.txt")
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
#### `fs.ls(uri, **options) -> list[str]`
|
|
191
|
+
List files recursively.
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# S3
|
|
195
|
+
files = fs.ls("s3://bucket/prefix/")
|
|
196
|
+
# [
|
|
197
|
+
# "s3://bucket/prefix/file1.csv",
|
|
198
|
+
# "s3://bucket/prefix/subdir/file2.json",
|
|
199
|
+
# ]
|
|
200
|
+
|
|
201
|
+
# Local
|
|
202
|
+
files = fs.ls("/path/to/dir/")
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
#### `fs.ls_dirs(uri, **options) -> list[str]`
|
|
206
|
+
List immediate child directories.
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
dirs = fs.ls_dirs("s3://bucket/")
|
|
210
|
+
# [
|
|
211
|
+
# "s3://bucket/data",
|
|
212
|
+
# "s3://bucket/models",
|
|
213
|
+
# ]
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
#### `fs.exists(uri) -> bool`
|
|
217
|
+
Check if file/directory exists.
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
if fs.exists("s3://bucket/file.csv"):
|
|
221
|
+
    df = fs.read("s3://bucket/file.csv")
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
#### `fs.delete(uri, limit=100) -> list[str]`
|
|
225
|
+
Delete a file or the contents of a directory, subject to the `limit` safety cap.
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
deleted = fs.delete("s3://bucket/old_data/", limit=50)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Advanced Operations
|
|
232
|
+
|
|
233
|
+
#### `fs.read_dataset(uri) -> pyarrow.dataset.Dataset`
|
|
234
|
+
Read as PyArrow dataset for lazy loading.
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
dataset = fs.read_dataset("s3://bucket/partitioned.parquet")
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
#### `fs.read_image(uri) -> PIL.Image.Image`
|
|
241
|
+
Read image explicitly.
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
img = fs.read_image("s3://bucket/photo.jpg")
|
|
245
|
+
img.show()
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
#### `fs.apply(uri, fn) -> Any`
|
|
249
|
+
Apply a function to a file; an S3 object is first downloaded to `/tmp` and the function receives the local path.
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
def process_image(path):
|
|
253
|
+
    from PIL import Image
|
|
254
|
+
    img = Image.open(path)
|
|
255
|
+
    return img.size
|
|
256
|
+
|
|
257
|
+
width, height = fs.apply("s3://bucket/image.png", process_image)
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
#### `fs.local_file(uri) -> str`
|
|
261
|
+
Get local path (downloads from S3 if needed).
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
local_path = fs.local_file("s3://bucket/model.pkl")
|
|
265
|
+
# Returns: "/tmp/model.pkl"
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
#### `generate_presigned_url(url, expiry=3600, for_upload=False) -> str`
|
|
269
|
+
Generate S3 presigned URL.
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
# Download URL (expires in 1 hour)
|
|
273
|
+
download_url = generate_presigned_url("s3://bucket/file.pdf")
|
|
274
|
+
|
|
275
|
+
# Upload URL
|
|
276
|
+
upload_url = generate_presigned_url(
|
|
277
|
+
"s3://bucket/upload.pdf",
|
|
278
|
+
expiry=300, # 5 minutes
|
|
279
|
+
for_upload=True
|
|
280
|
+
)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## ContentService Integration
|
|
284
|
+
|
|
285
|
+
For specialized document parsing (PDF, DOCX, etc.), use `ContentService`:
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
from rem.services.content import ContentService
|
|
289
|
+
|
|
290
|
+
content_service = ContentService()
|
|
291
|
+
|
|
292
|
+
# Process PDF with OCR, layout detection, etc.
|
|
293
|
+
result = content_service.process_uri("s3://bucket/document.pdf")
|
|
294
|
+
# {
|
|
295
|
+
# "uri": "s3://bucket/document.pdf",
|
|
296
|
+
# "content": "Extracted text...",
|
|
297
|
+
# "metadata": {...},
|
|
298
|
+
# "provider": "pdf"
|
|
299
|
+
# }
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
The `ContentService` provides pluggable providers for complex formats that require specialized parsing.
|
|
303
|
+
|
|
304
|
+
## Parsing Hooks
|
|
305
|
+
|
|
306
|
+
Parsing hooks keep parsed output cleanly separated from the original uploads. After you upload a PDF, what you usually need is the structured Markdown plus any extracted images and tables, so the FS provider maps each upload to its parsed content deterministically.
|
|
307
|
+
|
|
308
|
+
### Convention
|
|
309
|
+
|
|
310
|
+
Separate `uploads/` and `parsed/` directories with deterministic path mapping:
|
|
311
|
+
|
|
312
|
+
```
|
|
313
|
+
# S3 paths
|
|
314
|
+
s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf # Original
|
|
315
|
+
s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/ # Parsed directory
|
|
316
|
+
├── content.md # Primary content
|
|
317
|
+
├── metadata.json # Parse metadata
|
|
318
|
+
├── images/page_1.png # Extracted images
|
|
319
|
+
└── tables/table_0.parquet # Extracted tables
|
|
320
|
+
|
|
321
|
+
# Local paths
|
|
322
|
+
~/.rem/fs/v1/uploads/user-123/2025/01/19/report.pdf # Original
|
|
323
|
+
~/.rem/fs/v1/parsed/user-123/2025/01/19/report.pdf/ # Parsed directory
|
|
324
|
+
├── content.md
|
|
325
|
+
├── metadata.json
|
|
326
|
+
├── images/page_1.png
|
|
327
|
+
└── tables/table_0.parquet
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
### Configuration
|
|
331
|
+
|
|
332
|
+
Control paths via environment variables:
|
|
333
|
+
|
|
334
|
+
```bash
|
|
335
|
+
# S3 Settings
|
|
336
|
+
S3__BUCKET_NAME=rem-io-staging
|
|
337
|
+
S3__VERSION=v1
|
|
338
|
+
S3__UPLOADS_PREFIX=uploads
|
|
339
|
+
S3__PARSED_PREFIX=parsed
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
### Basic Usage
|
|
343
|
+
|
|
344
|
+
```python
|
|
345
|
+
from rem.services.fs import FS
|
|
346
|
+
|
|
347
|
+
fs = FS()
|
|
348
|
+
upload_uri = "s3://rem-io-staging/v1/uploads/user-123/2025/01/19/report.pdf"
|
|
349
|
+
|
|
350
|
+
# Check if already parsed
|
|
351
|
+
if fs.has_parsed(upload_uri):
|
|
352
|
+
    markdown = fs.read_parsed(upload_uri)
|
|
353
|
+
else:
|
|
354
|
+
    # Parse and cache
|
|
355
|
+
    result = parse_file(upload_uri)
|
|
356
|
+
    fs.write_parsed(
|
|
357
|
+
        upload_uri,
|
|
358
|
+
        result.markdown,
|
|
359
|
+
        metadata={"provider": "kreuzberg", "page_count": 10}
|
|
360
|
+
    )
|
|
361
|
+
# Writes to: s3://rem-io-staging/v1/parsed/user-123/2025/01/19/report.pdf/content.md
|
|
362
|
+
|
|
363
|
+
# List all parsed resources
|
|
364
|
+
resources = fs.list_parsed_resources(upload_uri)
|
|
365
|
+
# ['content.md', 'metadata.json', 'images/page_1.png', 'tables/table_0.parquet']
|
|
366
|
+
|
|
367
|
+
# Read specific resources
|
|
368
|
+
metadata = fs.read_parsed(upload_uri, "metadata.json")
|
|
369
|
+
image = fs.read_parsed(upload_uri, "images/page_1.png")
|
|
370
|
+
table = fs.read_parsed(upload_uri, "tables/table_0.parquet")
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Benefits
|
|
374
|
+
|
|
375
|
+
- **Separation of concerns**: Uploads and parsed content in separate directories
|
|
376
|
+
- **Deterministic mapping**: `uploads/user/date/file.pdf` -> `parsed/user/date/file.pdf/`
|
|
377
|
+
- **Caching**: Check `has_parsed()` before re-parsing expensive files
|
|
378
|
+
- **Discoverability**: `list_parsed_resources()` shows what's available
|
|
379
|
+
- **Flexibility**: Store markdown, images, tables, any extracted content
|
|
380
|
+
- **Scalable**: Clean separation works across S3 and local filesystems
|
|
381
|
+
|
|
382
|
+
See `parsing-hooks-examples.md` for more detailed examples.
|
|
383
|
+
|
|
384
|
+
## TODO: Future Enhancements
|
|
385
|
+
|
|
386
|
+
### High Priority
|
|
387
|
+
- [ ] **ContentService integration** for PDF parsing in `read()`
|
|
388
|
+
- [ ] **Multipart uploads** for large S3 files (>5GB)
|
|
389
|
+
- [ ] **Progress bars** for large uploads/downloads (tqdm)
|
|
390
|
+
- [ ] **Pagination** in `ls_iter()` for massive directories
|
|
391
|
+
|
|
392
|
+
### Medium Priority
|
|
393
|
+
- [ ] **python-docx provider** for `.docx` files
|
|
394
|
+
- [ ] **Audio providers** (librosa/pydub) for `.wav`, `.mp3`, `.flac`
|
|
395
|
+
- [ ] **Excel dependencies** (openpyxl/xlrd) for full `.xlsx`/`.xls` support
|
|
396
|
+
- [ ] **Archive operations** (`.zip`, `.tar.gz`) via copy interface
|
|
397
|
+
- [ ] **S3 versioning** support in all operations
|
|
398
|
+
|
|
399
|
+
### Low Priority
|
|
400
|
+
- [ ] **Local caching** strategy for `LocalProvider.cache_data()`
|
|
401
|
+
- [ ] **SVG to PIL** conversion for image operations
|
|
402
|
+
- [ ] **Video formats** (`.mp4`, `.avi`) via opencv or ffmpeg
|
|
403
|
+
- [ ] **Compression** options (gzip, brotli) for text formats
|
|
404
|
+
|
|
405
|
+
## Testing
|
|
406
|
+
|
|
407
|
+
```python
|
|
408
|
+
# Test basic operations
|
|
409
|
+
from rem.services.fs import FS
|
|
410
|
+
|
|
411
|
+
fs = FS()
|
|
412
|
+
|
|
413
|
+
# Write and read
|
|
414
|
+
fs.write("/tmp/test.json", {"test": "data"})
|
|
415
|
+
data = fs.read("/tmp/test.json")
|
|
416
|
+
assert data == {"test": "data"}
|
|
417
|
+
|
|
418
|
+
# S3 operations (requires configured bucket)
|
|
419
|
+
fs.write("s3://test-bucket/data.csv", df)  # df: an existing Polars or Pandas DataFrame
|
|
420
|
+
df2 = fs.read("s3://test-bucket/data.csv")
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
## Contributing
|
|
424
|
+
|
|
425
|
+
When adding new format support:
|
|
426
|
+
|
|
427
|
+
1. Add reader logic to both `S3Provider.read()` and `LocalProvider.read()`
|
|
428
|
+
2. Add writer logic to both `S3Provider.write()` and `LocalProvider.write()`
|
|
429
|
+
3. Add optional dependency to `pyproject.toml` with comment
|
|
430
|
+
4. Add format documentation to this README
|
|
431
|
+
5. Consider ContentService for complex formats (PDF, DOCX, etc.)
|
|
432
|
+
|
|
433
|
+
## Path Conventions
|
|
434
|
+
|
|
435
|
+
REM uses standardized path conventions for consistent file organization across local and S3 storage.
|
|
436
|
+
|
|
437
|
+
### Path Structure
|
|
438
|
+
|
|
439
|
+
```
|
|
440
|
+
{base_uri}/rem/{version}/{category}/{scope}/{date_parts}/
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
**Base URI:**
|
|
444
|
+
- **Local**: `$REM_HOME/fs/` (defaults to `~/.rem/fs`)
|
|
445
|
+
- **S3**: `s3://{bucket}/` (from settings)
|
|
446
|
+
- **Auto-detection**: Uses S3 in production, local in development
|
|
447
|
+
|
|
448
|
+
**Components:**
|
|
449
|
+
|
|
450
|
+
| Component | Description | Example |
|
|
451
|
+
|-----------|-------------|---------|
|
|
452
|
+
| `base_uri` | Storage location | `s3://rem-bucket` or `/Users/user/.rem/fs` |
|
|
453
|
+
| `rem` | Namespace | `rem` |
|
|
454
|
+
| `version` | API version | `v1`, `v2` |
|
|
455
|
+
| `category` | Resource type | `uploads`, `schemas`, `users`, `temp` |
|
|
456
|
+
| `scope` | User or system | `system`, `user-123` |
|
|
457
|
+
| `date_parts` | Date hierarchy | `2025/01/19` or `2025/01/19/14_30` |
|
|
458
|
+
|
|
459
|
+
### Upload Paths
|
|
460
|
+
|
|
461
|
+
Standard structure for file uploads with date-based partitioning:
|
|
462
|
+
|
|
463
|
+
```python
|
|
464
|
+
from rem.services.fs import get_uploads_path, FS
|
|
465
|
+
|
|
466
|
+
# System uploads (no user)
|
|
467
|
+
path = get_uploads_path()
|
|
468
|
+
# /Users/user/.rem/fs/rem/v1/uploads/system/2025/01/19
|
|
469
|
+
|
|
470
|
+
# User-specific uploads
|
|
471
|
+
path = get_uploads_path(user_id="user-123")
|
|
472
|
+
# /Users/user/.rem/fs/rem/v1/uploads/user-123/2025/01/19
|
|
473
|
+
|
|
474
|
+
# With specific date
|
|
475
|
+
from datetime import date
|
|
476
|
+
path = get_uploads_path(user_id="user-456", dt=date(2025, 1, 15))
|
|
477
|
+
# /Users/user/.rem/fs/rem/v1/uploads/user-456/2025/01/15
|
|
478
|
+
|
|
479
|
+
# Include hour/minute for high-frequency uploads
|
|
480
|
+
from datetime import datetime
|
|
481
|
+
path = get_uploads_path(user_id="user-789", dt=datetime.now(), include_time=True)
|
|
482
|
+
# /Users/user/.rem/fs/rem/v1/uploads/user-789/2025/01/19/14_30
|
|
483
|
+
|
|
484
|
+
# Force S3
|
|
485
|
+
path = get_uploads_path(user_id="user-123", use_s3=True)
|
|
486
|
+
# s3://rem-bucket/rem/v1/uploads/user-123/2025/01/19
|
|
487
|
+
|
|
488
|
+
# Use with FS
|
|
489
|
+
fs = FS()
|
|
490
|
+
upload_dir = get_uploads_path(user_id="user-123")
|
|
491
|
+
fs.write(f"{upload_dir}/data.json", {"key": "value"})
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
### Versioned Resource Paths
|
|
495
|
+
|
|
496
|
+
For schemas, agents, tools, and datasets:
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
from rem.services.fs import get_versioned_path
|
|
500
|
+
|
|
501
|
+
# Schemas
|
|
502
|
+
path = get_versioned_path("schemas", "user-schema")
|
|
503
|
+
# /Users/user/.rem/fs/rem/v1/schemas/user-schema
|
|
504
|
+
|
|
505
|
+
# Agents (with version)
|
|
506
|
+
path = get_versioned_path("agents", "query-agent", version="v2")
|
|
507
|
+
# /Users/user/.rem/fs/rem/v2/agents/query-agent
|
|
508
|
+
|
|
509
|
+
# Tools
|
|
510
|
+
path = get_versioned_path("tools", "web-scraper")
|
|
511
|
+
# /Users/user/.rem/fs/rem/v1/tools/web-scraper
|
|
512
|
+
|
|
513
|
+
# Datasets
|
|
514
|
+
path = get_versioned_path("datasets", "training-data")
|
|
515
|
+
# /Users/user/.rem/fs/rem/v1/datasets/training-data
|
|
516
|
+
```
|
|
517
|
+
|
|
518
|
+
### User-Scoped Paths
|
|
519
|
+
|
|
520
|
+
For user-specific storage:
|
|
521
|
+
|
|
522
|
+
```python
|
|
523
|
+
from rem.services.fs import get_user_path
|
|
524
|
+
|
|
525
|
+
# User root
|
|
526
|
+
path = get_user_path("user-123")
|
|
527
|
+
# /Users/user/.rem/fs/rem/v1/users/user-123
|
|
528
|
+
|
|
529
|
+
# User documents
|
|
530
|
+
path = get_user_path("user-123", "documents")
|
|
531
|
+
# /Users/user/.rem/fs/rem/v1/users/user-123/documents
|
|
532
|
+
|
|
533
|
+
# User images
|
|
534
|
+
path = get_user_path("user-456", "images")
|
|
535
|
+
# /Users/user/.rem/fs/rem/v1/users/user-456/images
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
### Temporary Paths
|
|
539
|
+
|
|
540
|
+
For temporary file processing with timestamps:
|
|
541
|
+
|
|
542
|
+
```python
|
|
543
|
+
from rem.services.fs import get_temp_path
|
|
544
|
+
|
|
545
|
+
# Default temp
|
|
546
|
+
path = get_temp_path()
|
|
547
|
+
# /Users/user/.rem/fs/rem/v1/temp/tmp/20250119_143045
|
|
548
|
+
|
|
549
|
+
# Processing temp
|
|
550
|
+
path = get_temp_path("processing")
|
|
551
|
+
# /Users/user/.rem/fs/rem/v1/temp/processing/20250119_143045
|
|
552
|
+
|
|
553
|
+
# Conversion temp
|
|
554
|
+
path = get_temp_path("conversion")
|
|
555
|
+
# /Users/user/.rem/fs/rem/v1/temp/conversion/20250119_143045
|
|
556
|
+
```
|
|
557
|
+
|
|
558
|
+
### Path Utilities
|
|
559
|
+
|
|
560
|
+
```python
|
|
561
|
+
from rem.services.fs import (
|
|
562
|
+
get_base_uri,
|
|
563
|
+
get_rem_home,
|
|
564
|
+
ensure_dir_exists,
|
|
565
|
+
join_path
|
|
566
|
+
)
|
|
567
|
+
|
|
568
|
+
# Get base URI (auto-detect based on environment)
|
|
569
|
+
base = get_base_uri()
|
|
570
|
+
|
|
571
|
+
# Force local or S3
|
|
572
|
+
base = get_base_uri(use_s3=False) # /Users/user/.rem/fs
|
|
573
|
+
base = get_base_uri(use_s3=True) # s3://rem-bucket
|
|
574
|
+
|
|
575
|
+
# Get REM_HOME directory
|
|
576
|
+
home = get_rem_home() # /Users/user/.rem
|
|
577
|
+
|
|
578
|
+
# Ensure directory exists (local only, no-op for S3)
|
|
579
|
+
path = ensure_dir_exists("/path/to/dir")
|
|
580
|
+
|
|
581
|
+
# Join paths (auto-detects S3 vs local)
|
|
582
|
+
path = join_path("s3://bucket", "rem", "v1", "uploads")
|
|
583
|
+
# s3://bucket/rem/v1/uploads
|
|
584
|
+
|
|
585
|
+
path = join_path("/home/user", "rem", "data")
|
|
586
|
+
# /home/user/rem/data
|
|
587
|
+
```
|
|
588
|
+
|
|
589
|
+
### Best Practices
|
|
590
|
+
|
|
591
|
+
1. **Always use path functions** - Don't hardcode paths
|
|
592
|
+
```python
|
|
593
|
+
# ✅ Good
|
|
594
|
+
from rem.services.fs import get_uploads_path
|
|
595
|
+
path = get_uploads_path(user_id="user-123")
|
|
596
|
+
|
|
597
|
+
# ❌ Bad
|
|
598
|
+
path = "/Users/user/.rem/fs/rem/v1/uploads/user-123/2025/01/19"
|
|
599
|
+
```
|
|
600
|
+
|
|
601
|
+
2. **Trust auto-detection** - Let environment determine S3 vs local
|
|
602
|
+
```python
|
|
603
|
+
# ✅ Good - auto-detects based on ENVIRONMENT
|
|
604
|
+
path = get_uploads_path(user_id="user-123")
|
|
605
|
+
|
|
606
|
+
# ❌ Unnecessary - only force when you have a specific reason
|
|
607
|
+
path = get_uploads_path(user_id="user-123", use_s3=False)
|
|
608
|
+
```
|
|
609
|
+
|
|
610
|
+
3. **Use date partitioning** - Leverage hierarchy for scalability
|
|
611
|
+
```python
|
|
612
|
+
# ✅ Good - partitioned by date
|
|
613
|
+
path = get_uploads_path(user_id="user-123", dt=datetime.now())
|
|
614
|
+
|
|
615
|
+
# ✅ Also good - include time for high-frequency uploads
|
|
616
|
+
path = get_uploads_path(user_id="user-123", include_time=True)
|
|
617
|
+
```
|
|
618
|
+
|
|
619
|
+
4. **User vs system scope** - Use user_id for user files, omit for system files
|
|
620
|
+
```python
|
|
621
|
+
# User files
|
|
622
|
+
user_upload = get_uploads_path(user_id="user-123")
|
|
623
|
+
|
|
624
|
+
# System files (logs, configs, etc.)
|
|
625
|
+
system_upload = get_uploads_path() # Uses "system"
|
|
626
|
+
```
|
|
627
|
+
|
|
628
|
+
5. **Ensure directories exist** - For local paths before writing
|
|
629
|
+
```python
|
|
630
|
+
from rem.services.fs import get_uploads_path, ensure_dir_exists, FS
|
|
631
|
+
|
|
632
|
+
path = get_uploads_path(user_id="user-123")
|
|
633
|
+
ensure_dir_exists(path) # No-op for S3
|
|
634
|
+
|
|
635
|
+
fs = FS()
|
|
636
|
+
fs.write(f"{path}/data.json", data)
|
|
637
|
+
```
|
|
638
|
+
|
|
639
|
+
### Path Reference
|
|
640
|
+
|
|
641
|
+
Quick reference for all path types:
|
|
642
|
+
|
|
643
|
+
| Function | Path Structure | Example |
|
|
644
|
+
|----------|----------------|---------|
|
|
645
|
+
| `get_uploads_path()` | `rem/v1/uploads/{system\|user_id}/{yyyy}/{mm}/{dd}[/{hh_mm}]` | `rem/v1/uploads/user-123/2025/01/19` |
|
|
646
|
+
| `get_versioned_path()` | `rem/{version}/{resource_type}/{name}` | `rem/v1/schemas/user-schema` |
|
|
647
|
+
| `get_user_path()` | `rem/v1/users/{user_id}[/{subpath}]` | `rem/v1/users/user-123/documents` |
|
|
648
|
+
| `get_temp_path()` | `rem/v1/temp/{prefix}/{timestamp}` | `rem/v1/temp/processing/20250119_143045` |
|
|
649
|
+
|
|
650
|
+
### Examples
|
|
651
|
+
|
|
652
|
+
See `rem/src/rem/services/fs/examples_paths.py` for complete working examples:
|
|
653
|
+
|
|
654
|
+
```bash
|
|
655
|
+
python -m rem.services.fs.examples_paths
|
|
656
|
+
```
|
|
657
|
+
|
|
658
|
+
## See Also
|
|
659
|
+
|
|
660
|
+
- ContentService: `rem/src/rem/services/content/` - Specialized parsing (PDF, DOCX, etc.)
|
|
661
|
+
- Settings: `rem/settings.py` - S3Settings, REM_HOME configuration
|
|
662
|
+
- Examples: `rem/src/rem/services/fs/examples_paths.py` - Path convention examples
|