remdb 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (187) hide show
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.0.dist-info/METADATA +1455 -0
  185. remdb-0.3.0.dist-info/RECORD +187 -0
  186. remdb-0.3.0.dist-info/WHEEL +4 -0
  187. remdb-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1269 @@
1
+ # ContentService - File Ingestion & Processing System
2
+
3
+ Centralized file ingestion pipeline with rich parsing state and pluggable content providers.
4
+
5
+ ## Overview
6
+
7
+ ContentService provides **three distinct entry points** for file operations:
8
+
9
+ 1. **`ingest_file()`** - Complete ingestion pipeline (MCP tools, full workflow)
10
+ - Read from source (local/S3/HTTP)
11
+ - Store to internal tenant-scoped storage
12
+ - Parse and extract rich content (parsing state)
13
+ - Chunk and embed into searchable resources
14
+ - Create File entity + Resource chunks in database
15
+
16
+ 2. **`process_uri()`** - Read-only content extraction (CLI, testing)
17
+ - Extract content from any URI
18
+ - No storage, no database writes
19
+ - Returns: extracted text + metadata
20
+
21
+ 3. **`process_and_save()`** - Process existing stored files (workers)
22
+ - Process files already in internal storage
23
+ - Chunk and create Resource entities
24
+ - Used by SQS worker for async processing
25
+
26
+ ### Parsing State - The Innovation
27
+
28
+ Files (PDF, WAV, DOCX, etc.) are converted to **rich parsing state**:
29
+ - **Content**: Markdown-formatted text (preserves structure)
30
+ - **Metadata**: File info, extraction details, timestamps
31
+ - **Tables**: Structured data extracted as CSV
32
+ - **Images**: Extracted images for multimodal RAG
33
+ - **Provider Info**: Which parser was used, version, settings
34
+
35
+ This enables agents to deeply understand documents beyond simple text.
36
+
37
+ ## File Processing Conventions
38
+
39
+ ### S3 Path Structure
40
+
41
+ When files are uploaded to S3, they follow a strict convention for organization:
42
+
43
+ ```
44
+ s3://bucket/uploads/path/to/document.pdf # Original uploaded file
45
+ s3://bucket/parsed/path/to/document-pdf/ # Parsed artifacts directory
46
+ ├── document.pdf.parsed.md # Structured markdown output
47
+ ├── document.pdf.meta.yaml # Extraction metadata
48
+ └── artifacts/ # Extracted artifacts
49
+ ├── images/ # Extracted images
50
+ │ ├── page-1-img-0.png
51
+ │ ├── page-2-img-1.jpg
52
+ │ └── ...
53
+ └── tables/ # Extracted tables
54
+ ├── table-0.csv
55
+ ├── table-0.png # Cropped table image
56
+ ├── table-1.csv
57
+ └── ...
58
+ ```
59
+
60
+ ### Path Mapping Convention
61
+
62
+ The parsed path mirrors the upload path with two changes:
63
+ 1. Replace `/uploads/` prefix with `/parsed/`
64
+ 2. Use filename with extension as directory name (dots converted to hyphens)
65
+
66
+ **Examples:**
67
+ ```
68
+ uploads/docs/report.pdf → parsed/docs/report-pdf/
69
+ uploads/data/sheet.xlsx → parsed/data/sheet-xlsx/
70
+ uploads/user/123/invoice.pdf → parsed/user/123/invoice-pdf/
71
+ ```
72
+
73
+ ### Output File Formats
74
+
75
+ #### 1. Structured Markdown (`.parsed.md`)
76
+
77
+ All documents are converted to structured markdown before chunking and embedding. This provides:
78
+ - Consistent format for downstream processing
79
+ - Preserves document structure (headings, lists, tables)
80
+ - Human-readable intermediate representation
81
+ - Easy to version control and diff
82
+
83
+ **Example structure:**
84
+ ```markdown
85
+ # Document Title
86
+
87
+ ## Metadata
88
+ - **Source**: report.pdf
89
+ - **Pages**: 42
90
+ - **Extracted**: 2025-01-15T10:30:00Z
91
+ - **Parser**: kreuzberg
92
+ - **Language**: en (0.98 confidence)
93
+
94
+ ## Content
95
+
96
+ Lorem ipsum dolor sit amet...
97
+
98
+ ### Section 1
99
+
100
+ Content with **formatting** preserved.
101
+
102
+ ### Tables
103
+
104
+ See [Table 1](artifacts/tables/table-0.csv) - Financial Summary
105
+
106
+ ### Images
107
+
108
+ ![Diagram 1](artifacts/images/page-5-img-0.png)
109
+ ```
110
+
111
+ #### 2. Metadata YAML (`.meta.yaml`)
112
+
113
+ Comprehensive extraction metadata in YAML format for downstream processing:
114
+
115
+ ```yaml
116
+ # Source file metadata
117
+ source:
118
+ uri: s3://bucket/uploads/docs/report.pdf
119
+ filename: report.pdf
120
+ size_bytes: 2456789
121
+ content_type: application/pdf
122
+ uploaded_at: "2025-01-15T10:25:00Z"
123
+ etag: "abc123..."
124
+
125
+ # Extraction metadata
126
+ extraction:
127
+ parser: kreuzberg
128
+ parser_version: "3.21.0"
129
+ extracted_at: "2025-01-15T10:30:00Z"
130
+ processing_time_seconds: 45.2
131
+ config:
132
+ extract_tables: true
133
+ extract_images: true
134
+ force_ocr: false
135
+
136
+ # Document metadata
137
+ document:
138
+ page_count: 42
139
+ word_count: 8542
140
+ char_count: 52341
141
+ detected_language: en
142
+ language_confidence: 0.98
143
+ document_type: report
144
+ document_type_confidence: 0.87
145
+
146
+ # Extracted artifacts
147
+ artifacts:
148
+ tables:
149
+ - id: table-0
150
+ page: 5
151
+ rows: 12
152
+ columns: 6
153
+ file: artifacts/tables/table-0.csv
154
+ image: artifacts/tables/table-0.png
155
+ confidence: 0.92
156
+ - id: table-1
157
+ page: 18
158
+ rows: 8
159
+ columns: 4
160
+ file: artifacts/tables/table-1.csv
161
+ image: artifacts/tables/table-1.png
162
+ confidence: 0.88
163
+
164
+ images:
165
+ - id: img-0
166
+ page: 3
167
+ file: artifacts/images/page-3-img-0.png
168
+ width: 1200
169
+ height: 800
170
+ format: png
171
+ ocr_applied: false
172
+ - id: img-1
173
+ page: 12
174
+ file: artifacts/images/page-12-img-1.jpg
175
+ width: 800
176
+ height: 600
177
+ format: jpeg
178
+ ocr_applied: true
179
+ ocr_text: "Extracted text from image..."
180
+
181
+ # Quality metrics
182
+ quality:
183
+ overall_score: 0.94
184
+ ocr_required: false
185
+ warnings: []
186
+
187
+ # Chunking metadata (added after embedding)
188
+ chunking:
189
+ strategy: semantic
190
+ chunk_count: 156
191
+ avg_chunk_size: 512
192
+ overlap_tokens: 50
193
+ ```
194
+
195
+ ### Processing Pipeline
196
+
197
+ ```
198
+ 1. Upload to S3
199
+ └─> uploads/docs/report.pdf
200
+
201
+ 2. S3 Event → SQS → Worker Pod
202
+    └─> ContentService.process_and_save()
203
+
204
+ 3. Extract with Kreuzberg
205
+ ├─> Text content
206
+ ├─> Tables (CSV + images)
207
+ ├─> Images (PNG/JPEG)
208
+ └─> Metadata
209
+
210
+ 4. Generate structured outputs
211
+ ├─> Convert to markdown
212
+ ├─> Save artifacts to S3
213
+ └─> Generate meta.yaml
214
+
215
+ 5. Write to S3 parsed directory
216
+ └─> parsed/docs/report-pdf/
217
+ ├── report.pdf.parsed.md
218
+ ├── report.pdf.meta.yaml
219
+ └── artifacts/
220
+ ├── images/
221
+ └── tables/
222
+
223
+ 6. Chunk and embed markdown
224
+ └─> PostgreSQL + pgvector (as Resources)
225
+
226
+ 7. Create graph edges
227
+ └─> Link Resources to User, Files, Moments
228
+ ```
229
+
230
+ ### Why This Convention?
231
+
232
+ **Separation of concerns:**
233
+ - `/uploads/` - Raw user-uploaded files (immutable)
234
+ - `/parsed/` - Processed, structured outputs (reproducible)
235
+
236
+ **Artifact organization:**
237
+ - Keeps all related files together in one directory
238
+ - Easy to reference artifacts from markdown
239
+ - Preserves relative paths for portability
240
+ - Simple to delete all artifacts for a document
241
+
242
+ **Structured markdown:**
243
+ - Single format for all document types
244
+ - Consistent chunking and embedding strategy
245
+ - Easier to build RAG context
246
+ - Human-readable for debugging
247
+
248
+ **Metadata tracking:**
249
+ - Complete audit trail of processing
250
+ - Reproducible extraction (config stored)
251
+ - Quality metrics for filtering
252
+ - Artifact registry for lookup
253
+
254
+ ## Architecture
255
+
256
+ ```
257
+ User uploads file
258
+
259
+ S3: s3://rem/uploads/docs/report.pdf
260
+ ↓ (S3 ObjectCreated event)
261
+ SQS: rem-file-processing queue
262
+ ↓ (KEDA monitors queue depth)
263
+ K8s Deployment: file-processor (0-20 pods)
264
+ ↓ (SQSFileProcessor worker)
265
+ ContentService.process_and_save()
266
+ ↓ (Kreuzberg DocProvider)
267
+ Extract: text + tables + images + metadata
268
+
269
+ Generate structured outputs:
270
+ - report.pdf.parsed.md (markdown)
271
+ - report.pdf.meta.yaml (metadata)
272
+ - artifacts/images/*.png
273
+ - artifacts/tables/*.csv
274
+
275
+ S3: s3://rem/parsed/docs/report-pdf/
276
+ ↓ (TODO: chunking + embedding)
277
+ PostgreSQL + pgvector (as Resources)
278
+ ↓ (TODO: graph edges)
279
+ Link to User, File entities
280
+ ```
281
+
282
+ ## Quick Start
283
+
284
+ ### 1. Complete Ingestion (MCP Tool Pattern)
285
+
286
+ ```python
287
+ from rem.services.content import ContentService
288
+
289
+ service = ContentService()
290
+
291
+ # Full pipeline: read → store → parse → chunk → embed
292
+ result = await service.ingest_file(
293
+ file_uri="/path/to/contract.pdf", # or s3://, https://
294
+ user_id="user-123",
295
+ category="legal",
296
+ tags=["contract", "q1-2025"],
297
+ is_local_server=True # Security: only local servers can read local files
298
+ )
299
+
300
+ print(f"File ID: {result['file_id']}")
301
+ print(f"Storage: {result['storage_uri']}")
302
+ print(f"Resources created: {result['resources_created']}")
303
+ print(f"Status: {result['processing_status']}")
304
+ print(f"Parsing metadata: {result['parsing_metadata']}")
305
+ ```
306
+
307
+ ### 2. Read-Only Extraction (CLI Pattern)
308
+
309
+ ```python
310
+ service = ContentService()
311
+
312
+ # Extract content only (no storage, no database)
313
+ result = service.process_uri("./README.md")
314
+
315
+ print(result["content"]) # Extracted text
316
+ print(result["metadata"]) # File metadata
317
+ print(result["provider"]) # "markdown"
318
+ ```
319
+
320
+ ### 3. Process Existing Files (Worker Pattern)
321
+
322
+ ```python
323
+ service = ContentService()
324
+
325
+ # Process file already in internal storage
326
+ result = await service.process_and_save(
327
+ uri="file:///Users/.../.rem/fs/acme-corp/files/abc123/doc.pdf",
328
+ user_id="user-123"
329
+ )
330
+
331
+ print(f"Chunks created: {result['chunk_count']}")
332
+ ```
333
+
334
+ ### CLI Usage
335
+
336
+ ```bash
337
+ # Read-only extraction (no storage)
338
+ rem process uri ./README.md
339
+ rem process uri s3://bucket/doc.md -o json
340
+ rem process uri https://example.com/paper.pdf -s output.json
341
+ ```
342
+
343
+ ### MCP Tool Usage
344
+
345
+ ```python
346
+ # Via MCP protocol (uses ingest_file internally)
347
+ result = await parse_and_ingest_file(
348
+ file_uri="s3://bucket/report.pdf",
349
+ user_id="user-123",
350
+ is_local_server=False # Remote server = no local file access
351
+ )
352
+ ```
353
+
354
+ ## Supported File Formats
355
+
356
+ ### Currently Supported
357
+
358
+ #### Markdown (`.md`, `.markdown`)
359
+ - **Provider**: `TextProvider`
360
+ - **Capabilities**: UTF-8 text extraction, heading analysis, line/char counts
361
+ - **Use case**: Documentation, notes, structured text
362
+
363
+ #### PDF (`.pdf`)
364
+ - **Provider**: `DocProvider` (powered by Kreuzberg)
365
+ - **Capabilities**:
366
+ - Text extraction with OCR fallback (Tesseract)
367
+ - Intelligent table detection and reconstruction
368
+ - Multi-format support (native PDF, scanned, password-protected)
369
+ - Daemon-safe subprocess workaround for ASGI servers
370
+ - Configurable accuracy thresholds
371
+ - **Use case**: Documents, reports, forms, scanned papers
372
+
373
+ #### Audio (`.wav`, `.mp3`, `.m4a`, `.flac`, `.ogg`)
374
+ - **Provider**: `AudioProvider` (powered by OpenAI Whisper)
375
+ - **Capabilities**:
376
+ - Speech-to-text transcription
377
+ - Automatic silence-based chunking
378
+ - Markdown-formatted output with timestamps
379
+ - Cost estimation ($0.006/minute)
380
+ - **Use case**: Meeting recordings, interviews, podcasts, voice memos
381
+
382
+ #### Images (`.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`)
383
+ - **Provider**: `ImageProvider` (vision LLM + CLIP embeddings)
384
+ - **Capabilities**:
385
+ - **Tier-based vision analysis**: Gold tier users always get vision LLM descriptions
386
+ - **Sampling-based analysis**: Non-gold users get analysis based on sample rate (0.0-1.0)
387
+ - **Multi-provider support**: Anthropic Claude, Google Gemini, OpenAI GPT-4o
388
+ - **Metadata extraction**: Image dimensions, format detection
389
+ - **Markdown descriptions**: Vision LLM generates detailed markdown descriptions
390
+ - **CLIP embeddings**: Semantic image search with Jina AI (512/768-dim vectors)
391
+ - **Graceful degradation**: Falls back when API keys unavailable
392
+ - **Use case**: Screenshots, diagrams, charts, photos, scanned images
393
+ - **Configuration**:
394
+ - Vision LLM (expensive, tier/sample-gated):
395
+ - `CONTENT__IMAGE_VLLM_SAMPLE_RATE`: Sampling rate (0.0 = never, 1.0 = always)
396
+ - `CONTENT__IMAGE_VLLM_PROVIDER`: Provider (anthropic, gemini, openai)
397
+ - `CONTENT__IMAGE_VLLM_MODEL`: Model name (optional, uses provider default)
398
+ - CLIP Embeddings (cheap, always-on when key provided):
399
+ - `CONTENT__CLIP_PROVIDER`: Provider (jina for API, self-hosted for future, default: jina)
400
+ - `CONTENT__CLIP_MODEL`: Model (jina-clip-v1 or jina-clip-v2, default: v2)
401
+ - `CONTENT__JINA_API_KEY`: Jina AI API key (get free key at https://jina.ai/embeddings/)
402
+ - **Storage**: Saves to `ImageResource` table (separate from regular `Resource`)
403
+ - **Pricing**:
404
+ - Vision LLM: ~$0.01-0.05 per image (varies by provider)
405
+ - CLIP: ~$0.02 per million tokens (~4K tokens per 512x512 image = $0.00008/image)
406
+ - Free tier: 10M CLIP tokens (~2,500 images)
407
+
408
+ ### Planned Support (via Kreuzberg)
409
+
410
+ Kreuzberg supports 50+ file formats. Easy to add via provider pattern:
411
+
412
+ #### Documents & Productivity
413
+ - **Word**: `.docx`, `.doc` - Microsoft Word documents
414
+ - **Excel**: `.xlsx`, `.xls`, `.ods` - Spreadsheets with table extraction
415
+ - **PowerPoint**: `.pptx`, `.ppt` - Presentations
416
+ - **Rich Text**: `.rtf` - Formatted text documents
417
+ - **EPUB**: Digital books
418
+
419
+ #### Structured Data
420
+ - **Web**: `.html`, `.xml` - Web pages and markup
421
+ - **Data**: `.json`, `.yaml`, `.toml` - Configuration and data files
422
+
423
+ #### Academic & Technical
424
+ - **LaTeX**: `.tex`, `.bib` - Academic papers and bibliographies
425
+ - **Jupyter**: `.ipynb` - Notebooks with code and outputs
426
+ - **Markup**: `.rst`, `.org` - reStructuredText, Org Mode
427
+ - **Markdown variants**: Enhanced markdown processing
428
+
429
+ #### Communication
430
+ - **Email**: `.eml`, `.msg` - Email messages with attachments
431
+
432
+ #### Archives
433
+ - **Compressed**: `.zip`, `.tar`, `.gz`, `.7z` - Extract and process contents
434
+
435
+ ## Kreuzberg Document Intelligence
436
+
437
+ ### What is Kreuzberg?
438
+
439
+ Kreuzberg is a polyglot document processing system with a Rust core that extracts text, metadata, and structured information from documents. It provides:
440
+
441
+ - **50+ file format support** - PDFs, Office docs, images, HTML, XML, archives, email, and more
442
+ - **OCR with table extraction** - Multiple backends (Tesseract, EasyOCR, PaddleOCR)
443
+ - **Intelligent table detection** - Reconstructs table structure with configurable thresholds
444
+ - **Batch processing** - Concurrent document handling with automatic resource management
445
+ - **Memory efficiency** - Streaming parsers handle multi-GB files with constant memory
446
+ - **Language detection** - Automatic detection with confidence thresholds
447
+ - **Metadata extraction** - Authors, titles, dates, page counts, EXIF, format-specific props
448
+
449
+ ### ExtractionConfig Options
450
+
451
+ Kreuzberg's `ExtractionConfig` provides fine-grained control over extraction:
452
+
453
+ ```python
454
+ from kreuzberg import ExtractionConfig, GMFTConfig
455
+
456
+ config = ExtractionConfig(
457
+ # Table extraction
458
+ extract_tables=True, # Enable table detection
459
+ extract_tables_from_ocr=False, # Extract tables from OCR images
460
+ gmft_config=GMFTConfig(
461
+ detector_base_threshold=0.85, # Detection confidence threshold
462
+ enable_multi_header=True, # Support multi-row headers
463
+ remove_null_rows=True, # Clean up empty rows
464
+ ),
465
+
466
+ # Image extraction
467
+ extract_images=True, # Extract embedded images
468
+ deduplicate_images=True, # Remove duplicate images
469
+ ocr_extracted_images=False, # Run OCR on extracted images
470
+ image_ocr_min_dimensions=(50, 50), # Min image size for OCR
471
+ image_ocr_max_dimensions=(10000, 10000),# Max image size for OCR
472
+
473
+ # OCR configuration
474
+ force_ocr=False, # Force OCR even for native text
475
+ ocr_backend='tesseract', # 'tesseract', 'paddle', 'easyocr'
476
+
477
+ # Content processing
478
+ chunk_content=False, # Enable semantic chunking
479
+ max_chars=2000, # Max chars per chunk
480
+ max_overlap=100, # Overlap between chunks
481
+
482
+ # Entity extraction
483
+ extract_entities=False, # Extract named entities
484
+ extract_keywords=False, # Extract keywords
485
+ keyword_count=10, # Number of keywords to extract
486
+
487
+ # Language detection
488
+ auto_detect_language=False, # Auto-detect document language
489
+ language_detection_model='auto', # 'lite', 'full', 'auto'
490
+
491
+ # Document classification
492
+ auto_detect_document_type=False, # Classify document type
493
+ document_type_confidence_threshold=0.5, # Min confidence threshold
494
+
495
+ # Quality processing
496
+ enable_quality_processing=True, # Enable quality assessment
497
+
498
+ # PDF-specific
499
+ pdf_password='', # Password for encrypted PDFs
500
+ target_dpi=150, # Target DPI for rendering
501
+ auto_adjust_dpi=True, # Auto-adjust DPI for large pages
502
+
503
+ # HTML/JSON-specific
504
+ html_to_markdown_config=None, # HTML→Markdown options
505
+ json_config=None, # JSON extraction options
506
+
507
+ # Performance
508
+ use_cache=True, # Enable extraction cache
509
+ )
510
+ ```
511
+
512
+ ### Extraction Result Structure
513
+
514
+ Kreuzberg returns a rich result object with comprehensive metadata:
515
+
516
+ ```python
517
+ from kreuzberg import extract_file_sync
518
+ from pathlib import Path
519
+
520
+ result = extract_file_sync(Path("document.pdf"), config=config)
521
+
522
+ # Core content
523
+ result.content # str: Full extracted text
524
+ result.mime_type # str: Detected MIME type
525
+
526
+ # Tables (list of dicts)
527
+ for table in result.tables:
528
+ table['page_number'] # int: Source page
529
+ table['text'] # str: Table as markdown/text
530
+ table['df'] # pandas.DataFrame: Structured data
531
+ table['cropped_image'] # PIL.Image: Cropped table image
532
+
533
+ # Images (list of dicts)
534
+ for img in result.images:
535
+ img['page'] # int: Source page
536
+ img['image'] # PIL.Image: Image data
537
+ img['width'], img['height'] # int: Dimensions
538
+ img['format'] # str: Image format
539
+
540
+ # Document metadata
541
+ result.metadata = {
542
+ 'page_count': int, # Number of pages
543
+ 'author': str, # Document author
544
+ 'title': str, # Document title
545
+ 'creation_date': str, # ISO8601 timestamp
546
+ 'modification_date': str, # ISO8601 timestamp
547
+ 'summary': str, # Auto-generated summary
548
+ 'quality_score': float, # 0.0-1.0 quality metric
549
+ }
550
+
551
+ # Language detection
552
+ result.detected_languages = [
553
+ {'language': 'en', 'confidence': 0.98},
554
+ {'language': 'es', 'confidence': 0.02},
555
+ ]
556
+
557
+ # Document classification
558
+ result.document_type # str: 'report', 'invoice', etc.
559
+ result.document_type_confidence # float: 0.0-1.0
560
+
561
+ # Entity extraction (if enabled)
562
+ result.entities = {
563
+ 'PERSON': ['John Doe', 'Jane Smith'],
564
+ 'ORG': ['Acme Corp', 'TechCo'],
565
+ 'GPE': ['New York', 'London'],
566
+ 'DATE': ['2025-01-15', '2024-12-31'],
567
+ }
568
+
569
+ # Keywords (if enabled)
570
+ result.keywords = ['machine learning', 'artificial intelligence', ...]
571
+
572
+ # Chunks (if chunk_content=True)
573
+ result.chunks = [
574
+ {'text': '...', 'start': 0, 'end': 2000},
575
+ {'text': '...', 'start': 1900, 'end': 3900},
576
+ ]
577
+
578
+ # OCR results (if OCR was applied)
579
+ result.image_ocr_results = [...]
580
+
581
+ # Layout information
582
+ result.layout # Document layout structure
583
+
584
+ # Utility methods
585
+ result.to_dict() # Convert to dict for serialization
586
+ result.to_markdown() # Convert to markdown format
587
+ result.export_tables_to_csv(dir) # Export all tables to CSV files
588
+ result.export_tables_to_tsv(dir) # Export all tables to TSV files
589
+ result.get_table_summaries() # Get summaries of all tables
590
+ ```
591
+
592
+ ### Daemon Process Workaround
593
+
594
+ When running in ASGI servers (Hypercorn, Uvicorn), Kreuzberg's ProcessPoolExecutor may fail due to daemon restrictions. Our `DocProvider` implements a subprocess workaround:
595
+
596
+ ```python
597
+ def _is_daemon_process(self) -> bool:
598
+ """Check if running in a daemon process."""
599
+ try:
600
+ return multiprocessing.current_process().daemon
601
+ except Exception:
602
+ return False
603
+
604
+ def _parse_in_subprocess(self, file_path: Path) -> dict:
605
+ """Run kreuzberg in a separate subprocess to bypass daemon restrictions."""
606
+ # Executes parsing in isolated subprocess
607
+ # Serializes config and result as JSON
608
+ # 5 minute timeout for large documents
609
+ ```
610
+
611
+ This pattern ensures reliable parsing in production deployments.
612
+
613
+ ## Provider Configuration
614
+
615
+ ### Default Provider Settings
616
+
617
+ Providers are configured via `ContentSettings` in `settings.py`:
618
+
619
+ ```python
620
+ from rem.settings import settings
621
+
622
+ # View default supported types
623
+ print(settings.content.supported_text_types)
624
+ # [".txt", ".md", ".json", ".yaml", ".py", ".js", ...]
625
+
626
+ print(settings.content.supported_doc_types)
627
+ # [".pdf", ".docx", ".pptx", ".xlsx", ".png", ".jpg", ...]
628
+
629
+ print(settings.content.supported_audio_types)
630
+ # [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
631
+ ```
632
+
633
+ ### Environment Variable Overrides
634
+
635
+ You can override the default extension lists via environment variables:
636
+
637
+ ```bash
638
+ # Override document types to only support PDFs
639
+ export CONTENT__SUPPORTED_DOC_TYPES=".pdf"
640
+
641
+ # Override text types to only support markdown and Python
642
+ export CONTENT__SUPPORTED_TEXT_TYPES=".md,.py"
643
+
644
+ # Override audio types to only support WAV
645
+ export CONTENT__SUPPORTED_AUDIO_TYPES=".wav"
646
+
647
+ # Disable audio transcription in development
648
+ export CONTENT__SUPPORTED_AUDIO_TYPES=""
649
+ ```
650
+
651
+ ### Benefits of Settings-Based Configuration
652
+
653
+ 1. **No Long Lists** - Extension lists defined in one place (settings.py)
654
+ 2. **Environment Override** - Easy configuration via env vars
655
+ 3. **Clean Code** - ContentService.__init__() is concise
656
+ 4. **Testable** - Settings can be mocked/overridden in tests
657
+ 5. **DRY** - Single source of truth for supported file types
658
+
659
+ ## Kind-Based Document Routing
660
+
661
+ **Special Processing for YAML/JSON Files**
662
+
663
+ REM implements a **kind-based routing system** that intercepts YAML/JSON files during ingestion and routes them to specialized processors based on their `kind` field. This allows different document types to be stored in appropriate database tables instead of the default `resources` table.
664
+
665
+ ### Architecture
666
+
667
+ ```
668
+ YAML/JSON file ingestion
669
+
670
+ ContentService.process_and_save()
671
+
672
+ Check for 'kind' field in document
673
+ ├─ kind=agent or kind=evaluator
674
+ │ ↓
675
+ │ SchemaProvider validates schema structure
676
+ │ ↓
677
+ │ Save to 'schemas' table (NOT resources)
678
+ │ ↓
679
+ │ Log: "🔧 Custom provider flow initiated: kind=agent"
680
+
681
+ ├─ kind=engram
682
+ │ ↓
683
+ │ EngramProcessor creates structured memory
684
+ │ ↓
685
+ │ Save to 'resources' table (parent engram)
686
+ │ + 'moments' table (temporal events)
687
+ │ ↓
688
+ │ Log: "🔧 Custom provider flow initiated: kind=engram"
689
+
690
+ └─ No kind field
691
+
692
+ Standard text processing
693
+
694
+ Save to 'resources' table (default)
695
+ ```
696
+
697
+ ### Supported Kinds
698
+
699
+ #### 1. **kind=agent** (Agent Schemas)
700
+
701
+ Agent schemas define AI agents with structured outputs and tool access.
702
+
703
+ **Required fields:**
704
+ - `json_schema_extra.kind: agent`
705
+ - `json_schema_extra.name: <kebab-case-name>` (e.g., "cv-parser", "query-agent")
706
+ - `type: object`
707
+ - `properties: {...}` (output schema)
708
+ - `json_schema_extra.version: "1.0.0"`
709
+
710
+ **Storage:** `schemas` table with `category='agent'`
711
+
712
+ **Example:**
713
+ ```yaml
714
+ ---
715
+ type: object
716
+ description: |
717
+ You are a CV parser that extracts structured candidate information.
718
+
719
+ properties:
720
+ candidate_name:
721
+ type: string
722
+ description: Full name
723
+ skills:
724
+ type: array
725
+ items:
726
+ type: string
727
+
728
+ required:
729
+ - candidate_name
730
+
731
+ json_schema_extra:
732
+ kind: agent
733
+ name: cv-parser
734
+ version: "1.0.0"
735
+ tags: [recruitment, hr]
736
+ ```
737
+
738
+ #### 2. **kind=evaluator** (Evaluator Schemas)
739
+
740
+ Evaluator schemas define LLM-as-a-Judge evaluators for agent output assessment.
741
+
742
+ **Required fields:**
743
+ - `json_schema_extra.kind: evaluator`
744
+ - `json_schema_extra.name: <kebab-case-name>` (e.g., "rem-lookup-correctness")
745
+ - `type: object`
746
+ - `properties: {...}` (evaluation criteria)
747
+ - `json_schema_extra.version: "1.0.0"`
748
+
749
+ **Storage:** `schemas` table with `category='evaluator'`
750
+
751
+ **Example:**
752
+ ```yaml
753
+ ---
754
+ type: object
755
+ description: |
756
+ Evaluate REM LOOKUP query correctness.
757
+
758
+ properties:
759
+ correctness:
760
+ type: number
761
+ minimum: 0.0
762
+ maximum: 1.0
763
+ reasoning:
764
+ type: string
765
+
766
+ required:
767
+ - correctness
768
+
769
+ json_schema_extra:
770
+ kind: evaluator
771
+ name: rem-lookup-correctness
772
+ version: "1.0.0"
773
+ ```
774
+
775
+ #### 3. **kind=engram** (Memory Documents)
776
+
777
+ Engrams are bio-inspired memory documents that combine resources and temporal moments.
778
+
779
+ **Required fields:**
780
+ - `kind: engram`
781
+ - `name: <unique-identifier>`
782
+ - `content: <text content>`
783
+ - `moments: [...]` (optional temporal events)
784
+
785
+ **Storage:**
786
+ - Parent engram → `resources` table with `category='engram'`
787
+ - Child moments → `moments` table with graph edges to parent
788
+
789
+ **Example:**
790
+ ```yaml
791
+ ---
792
+ kind: engram
793
+ name: team-standup-2025-01-15
794
+ category: meeting
795
+ summary: Daily standup discussion
796
+ content: |
797
+ Team discussed sprint progress and blockers.
798
+
799
+ moments:
800
+ - start_time: 2025-01-15T09:00:00Z
801
+ end_time: 2025-01-15T09:15:00Z
802
+ summary: Sprint update
803
+ speakers: [sarah, mike]
804
+ topics: [sprint-planning, blockers]
805
+
806
+ graph_edges:
807
+ - dst: sarah-chen
808
+ rel_type: participated_in
809
+ weight: 1.0
810
+ ```
811
+
812
+ ### Implementation Details
813
+
814
+ #### ContentService._process_schema()
815
+
816
+ Handles `kind=agent` and `kind=evaluator`:
817
+
818
+ ```python
819
+ async def _process_schema(result: dict, uri: str, user_id: str) -> dict:
820
+ """
821
+ Save agent/evaluator schema to schemas table.
822
+
823
+ Args:
824
+ result: Extraction result from SchemaProvider with:
825
+ - metadata.kind: "agent" or "evaluator"
826
+ - metadata.name: Schema name
827
+ - schema_data: Full JSON Schema dict
828
+ uri: File URI
829
+ user_id: Tenant ID
830
+
831
+ Returns:
832
+ dict with schema_name, kind, version, status
833
+ """
834
+ # Create Schema entity
835
+ schema_entity = Schema(
836
+ tenant_id=user_id,
837
+ name=metadata['name'],
838
+ spec=schema_data,
839
+ category=metadata['kind'], # "agent" or "evaluator"
840
+ provider_configs=metadata.get('provider_configs', []),
841
+ embedding_fields=metadata.get('embedding_fields', []),
842
+ )
843
+
844
+ # Save to schemas table (NOT resources)
845
+ await postgres.batch_upsert(
846
+ records=[schema_entity],
847
+ model=Schema,
848
+ table_name="schemas",
849
+ entity_key_field="name",
850
+ )
851
+
852
+ logger.info(f"✅ Schema saved: {name} (kind={kind})")
853
+ ```
854
+
855
+ #### ContentService._process_engram()
856
+
857
+ Handles `kind=engram`:
858
+
859
+ ```python
860
+ async def _process_engram(data: dict, uri: str, user_id: str) -> dict:
861
+ """
862
+ Process engram into resources + moments.
863
+
864
+ Args:
865
+ data: Parsed engram with kind=engram
866
+ uri: File URI
867
+ user_id: User ID
868
+
869
+ Returns:
870
+ dict with resource_id, moment_ids, chunks_created
871
+ """
872
+ # Delegate to EngramProcessor
873
+ processor = EngramProcessor(postgres)
874
+ result = await processor.process_engram(
875
+ data=data,
876
+ user_id=user_id,
877
+ )
878
+
879
+ logger.info(f"✅ Engram processed: {result['resource_id']} "
880
+ f"with {len(result['moment_ids'])} moments")
881
+
882
+ return result
883
+ ```
884
+
885
+ ### Why Kind-Based Routing?
886
+
887
+ **Separation of Concerns:**
888
+ - Agents/evaluators belong in `schemas` table (metadata, no embeddings)
889
+ - Engrams belong in `resources` + `moments` tables (dual indexing)
890
+ - Regular files belong in `resources` table (standard chunking)
891
+
892
+ **Intercepting Default Flow:**
893
+ - Without routing, YAML/JSON files would be chunked and embedded as generic resources
894
+ - `kind` field allows processors to intercept before default processing
895
+ - Enables specialized handling while maintaining simple ingestion API
896
+
897
+ **Extensibility:**
898
+ - Easy to add new kinds (e.g., `kind=workflow`, `kind=config`)
899
+ - Each kind maps to a specialized processor
900
+ - Processors can save to any tables, not just resources
901
+
902
+ ### Logging
903
+
904
+ Custom provider flows are logged with emoji markers for visibility:
905
+
906
+ ```
907
+ 🔧 Custom provider flow initiated: kind=agent for cv-parser.yaml
908
+ Saving schema to schemas table: kind=agent, name=cv-parser, version=1.0.0
909
+ ✅ Schema saved: cv-parser (kind=agent)
910
+ ```
911
+
912
+ ```
913
+ 🔧 Custom provider flow initiated: kind=engram for team-standup.yaml
914
+ Processing engram: team-standup-2025-01-15
915
+ ✅ Engram processed: res_abc123 with 3 moments
916
+ ```
917
+
918
+ ## Provider Plugin System
919
+
920
+ You can register custom providers to either:
921
+ 1. **Add support for new file types** (e.g., `.epub`, `.rtf`)
922
+ 2. **Override default providers** for specific formats (e.g., use PyMuPDF instead of Kreuzberg for PDFs)
923
+
924
+ ### Example: Custom Provider for New File Type
925
+
926
+ ```python
927
+ from rem.services.content import ContentService
928
+ from rem.services.content.providers import ContentProvider
929
+
930
+ class EpubProvider(ContentProvider):
931
+ @property
932
+ def name(self) -> str:
933
+ return "epub"
934
+
935
+ def extract(self, content: bytes, metadata: dict) -> dict:
936
+ # Use ebooklib or other EPUB parser
937
+ text = extract_text_from_epub(content)
938
+ return {
939
+ "text": text,
940
+ "metadata": {
941
+ "chapters": 12,
942
+ "author": "...",
943
+ }
944
+ }
945
+
946
+ # Register for new file type
947
+ service = ContentService()
948
+ service.register_provider([".epub"], EpubProvider())
949
+ ```
950
+
951
+ ### Example: Override Default Provider
952
+
953
+ **Note**: This is a hypothetical example. The default `DocProvider` already handles PDFs well via Kreuzberg.
954
+
955
+ ```python
956
+ # Hypothetical: Override DocProvider with custom PDF parser
957
+ class CustomPDFProvider(ContentProvider):
958
+ @property
959
+ def name(self) -> str:
960
+ return "custom_pdf"
961
+
962
+ def extract(self, content: bytes, metadata: dict) -> dict:
963
+ # Use PyMuPDF, pdfplumber, or other library instead of Kreuzberg
964
+ text = extract_text_with_pymupdf(content)
965
+ return {
966
+ "text": text,
967
+ "metadata": {"parser": "pymupdf"}
968
+ }
969
+
970
+ # Override .pdf extension to use custom provider instead of DocProvider
971
+ service = ContentService()
972
+ service.register_provider([".pdf"], CustomPDFProvider())
973
+ ```
974
+
975
+ ### Provider Interface
976
+
977
+ All providers must implement:
978
+
979
+ ```python
980
+ class ContentProvider(ABC):
981
+ @property
982
+ @abstractmethod
983
+ def name(self) -> str:
984
+ """Provider name (e.g., 'markdown', 'pdf')."""
985
+ pass
986
+
987
+ @abstractmethod
988
+ def extract(self, content: bytes, metadata: dict) -> dict:
989
+ """
990
+ Extract content from file bytes.
991
+
992
+ Args:
993
+ content: Raw file bytes
994
+ metadata: File metadata (size, mime_type, etc.)
995
+
996
+ Returns:
997
+ Dict with 'text' and optional 'metadata' fields
998
+ """
999
+ pass
1000
+ ```
1001
+
1002
+ ## Event-Driven Processing
1003
+
1004
+ ### SQS Worker
1005
+
1006
+ A background worker that consumes S3 events from the SQS queue.
1007
+
1008
+ **Features:**
1009
+ - Long polling (20s) for efficient SQS usage
1010
+ - Graceful shutdown (SIGTERM/SIGINT)
1011
+ - Batch processing (up to 10 messages)
1012
+ - DLQ support (3 retries)
1013
+ - IRSA authentication (no credentials in code)
1014
+
1015
+ **Entry point:**
1016
+ ```bash
1017
+ python -m rem.workers.sqs_file_processor
1018
+ ```
1019
+
1020
+ ### K8s Deployment
1021
+
1022
+ Production deployment with KEDA autoscaling at `manifests/application/file-processor/`
1023
+
1024
+ **Scaling Example:**
1025
+ - 0 messages → 0 pods (scale to zero)
1026
+ - 25 messages → 3 pods
1027
+ - 100 messages → 10 pods
1028
+ - 250 messages → 20 pods (max)
1029
+
1030
+ ## Configuration
1031
+
1032
+ ### Environment Variables
1033
+
1034
+ ```bash
1035
+ # .env or K8s ConfigMap
1036
+ SQS__QUEUE_URL=https://sqs.us-east-1.amazonaws.com/ACCOUNT_ID/rem-file-processing
1037
+ SQS__REGION=us-east-1
1038
+ SQS__MAX_MESSAGES=10
1039
+ SQS__WAIT_TIME_SECONDS=20
1040
+ SQS__VISIBILITY_TIMEOUT=300
1041
+
1042
+ S3__BUCKET_NAME=rem
1043
+ S3__REGION=us-east-1
1044
+ ```
1045
+
1046
+ ### IRSA Permissions
1047
+
1048
+ Two IAM roles are required (one for the KEDA operator, one for the worker pods):
1049
+
1050
+ 1. **KEDA operator role** - Read SQS metrics
1051
+ ```
1052
+ sqs:GetQueueAttributes
1053
+ sqs:ListQueues
1054
+ ```
1055
+
1056
+ 2. **File processor role** - Consume queue + read S3
1057
+ ```
1058
+ sqs:ReceiveMessage
1059
+ sqs:DeleteMessage
1060
+ sqs:ChangeMessageVisibility
1061
+ s3:GetObject
1062
+ ```
1063
+
1064
+ ## Local Development
1065
+
1066
+ ### Test ContentService
1067
+
1068
+ ```bash
1069
+ cd rem
1070
+ uv pip install -e .
1071
+
1072
+ # Test with local file
1073
+ echo "# Test Document" > test.md
1074
+ python -c "from rem.services.content import ContentService; \
1075
+ print(ContentService().process_uri('test.md'))"
1076
+ ```
1077
+
1078
+ ### Test CLI
1079
+
1080
+ ```bash
1081
+ # Test with local file
1082
+ rem process uri test.md
1083
+
1084
+ # Test with S3 (requires AWS credentials or IRSA)
1085
+ aws s3 cp test.md s3://rem/uploads/
1086
+ rem process uri s3://rem/uploads/test.md
1087
+ ```
1088
+
1089
+ ### Test Worker
1090
+
1091
+ ```bash
1092
+ # Set environment variables
1093
+ export SQS__QUEUE_URL=https://sqs.us-east-1.amazonaws.com/ACCOUNT_ID/rem-file-processing
1094
+ export SQS__REGION=us-east-1
1095
+ export S3__BUCKET_NAME=rem
1096
+ export S3__REGION=us-east-1
1097
+
1098
+ # Run worker
1099
+ python -m rem.workers.sqs_file_processor
1100
+
1101
+ # Upload test file (in another terminal)
1102
+ aws s3 cp test.md s3://rem/uploads/
1103
+
1104
+ # Watch worker logs process the file
1105
+ ```
1106
+
1107
+ ## Production Deployment
1108
+
1109
+ ### 1. Build and push Docker image
1110
+
1111
+ ```bash
1112
+ cd rem
1113
+ docker build -f Dockerfile.worker -t your-registry/rem:latest .
1114
+ docker push your-registry/rem:latest
1115
+ ```
1116
+
1117
+ ### 2. Deploy infrastructure (Pulumi)
1118
+
1119
+ ```bash
1120
+ cd ../manifests/infra/file-queue
1121
+ pulumi up
1122
+ # Note queue URL and IAM policy ARNs
1123
+ ```
1124
+
1125
+ ### 3. Deploy KEDA platform
1126
+
1127
+ ```bash
1128
+ cd ../../platform/keda
1129
+ kubectl apply -f application.yaml
1130
+ ```
1131
+
1132
+ ### 4. Configure IRSA roles
1133
+
1134
+ ```bash
1135
+ # Annotate KEDA operator ServiceAccount
1136
+ kubectl annotate sa keda-operator -n keda \
1137
+ eks.amazonaws.com/role-arn=arn:aws:iam::ACCOUNT_ID:role/keda-operator-role
1138
+ ```
1139
+
1140
+ ### 5. Deploy file processor
1141
+
1142
+ ```bash
1143
+ cd ../../application/file-processor
1144
+
1145
+ # Update ACCOUNT_ID in deployment.yaml and keda-scaledobject.yaml
1146
+ sed -i 's/ACCOUNT_ID/123456789012/g' *.yaml
1147
+
1148
+ kubectl apply -f .
1149
+ ```
1150
+
1151
+ ### 6. Verify scaling
1152
+
1153
+ ```bash
1154
+ # Should start at 0 pods
1155
+ kubectl get pods -l app=file-processor
1156
+
1157
+ # Upload test files
1158
+ for i in {1..25}; do
1159
+ echo "test $i" | aws s3 cp - s3://rem/uploads/test-$i.md
1160
+ done
1161
+
1162
+ # Watch scale up to 3 pods
1163
+ kubectl get pods -l app=file-processor -w
1164
+
1165
+ # Watch HPA
1166
+ kubectl get hpa file-processor-scaler -w
1167
+ ```
1168
+
1169
+ ## Cost Optimization
1170
+
1171
+ - **Scale to zero**: No cost when idle
1172
+ - **Spot instances**: 70-90% savings (configured in Deployment affinity)
1173
+ - **Long polling**: Reduces SQS API calls
1174
+ - **Batch processing**: Up to 10 messages per receive
1175
+ - **KEDA efficiency**: Only scales when needed
1176
+
1177
+ **Monthly cost estimate** (us-east-1, assuming 10k files/day):
1178
+ - SQS: ~$1 (requests + data transfer)
1179
+ - S3 storage: ~$5 (100 GB)
1180
+ - Compute: ~$10 (spot instances, avg 2 pods)
1181
+ - **Total: ~$16/month**
1182
+
1183
+ ## Future Enhancements
1184
+
1185
+ ### Short Term
1186
+ - [ ] Alternative PDF provider (PyMuPDF/pdfplumber) alongside the default Kreuzberg-based DocProvider
1187
+ - [ ] PostgreSQL storage integration
1188
+ - [ ] Embedding generation (OpenAI/local models)
1189
+ - [ ] Graph edge creation
1190
+
1191
+ ### Medium Term
1192
+ - [ ] HTML/web page extraction
1193
+ - [ ] DOCX/Office formats
1194
+ - [ ] Image OCR (Tesseract/cloud OCR)
1195
+ - [ ] Chunking strategies for large documents
1196
+
1197
+ ### Long Term
1198
+ - [ ] Video transcription
1199
+ - [ ] Audio processing
1200
+ - [ ] Custom ML model inference
1201
+ - [ ] Multi-language support
1202
+
1203
+ ## Monitoring
1204
+
1205
+ ### CloudWatch Metrics
1206
+
1207
+ - **Queue depth**: `ApproximateNumberOfMessagesVisible`
1208
+ - **Processing rate**: Messages per second
1209
+ - **DLQ depth**: Failed messages
1210
+ - **Pod count**: Kubernetes metrics
1211
+
1212
+ ### Logs
1213
+
1214
+ ```bash
1215
+ # Worker logs
1216
+ kubectl logs -l app=file-processor -f
1217
+
1218
+ # KEDA scaling events
1219
+ kubectl logs -n keda -l app.kubernetes.io/name=keda-operator --tail=100
1220
+ ```
1221
+
1222
+ ### Alerts
1223
+
1224
+ - Queue depth > 100 for 5+ minutes (backlog)
1225
+ - DLQ depth > 0 (processing failures)
1226
+ - Pod crash loops (worker errors)
1227
+
1228
+ ## Troubleshooting
1229
+
1230
+ ### Pods not scaling
1231
+
1232
+ ```bash
1233
+ # Check ScaledObject
1234
+ kubectl describe scaledobject file-processor-scaler
1235
+
1236
+ # Check KEDA logs
1237
+ kubectl logs -n keda -l app.kubernetes.io/name=keda-operator --tail=50
1238
+ ```
1239
+
1240
+ ### Access Denied errors
1241
+
1242
+ ```bash
1243
+ # Verify IRSA annotation
1244
+ kubectl get sa file-processor -o yaml | grep role-arn
1245
+
1246
+ # Check pod environment
1247
+ kubectl get pod -l app=file-processor -o yaml | grep -A5 env:
1248
+ ```
1249
+
1250
+ ### Messages not being processed
1251
+
1252
+ ```bash
1253
+ # Check queue has messages
1254
+ aws sqs get-queue-attributes \
1255
+ --queue-url QUEUE_URL \
1256
+ --attribute-names ApproximateNumberOfMessages
1257
+
1258
+ # Check worker logs
1259
+ kubectl logs -l app=file-processor --tail=100
1260
+ ```
1261
+
1262
+ ## See Also
1263
+
1264
+ - Architecture: `/CLAUDE.md`
1265
+ - FS Service: `rem/src/rem/services/fs/` - Unified S3/local file operations
1266
+ - Settings: `rem/settings.py` - S3Settings, SQSSettings configuration
1267
+ - Infrastructure: `manifests/infra/file-queue/README.md`
1268
+ - KEDA: `manifests/platform/keda/README.md`
1269
+ - Deployment: `manifests/application/file-processor/README.md`