remdb 0.2.6__py3-none-any.whl → 0.3.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of remdb might be problematic.
Files changed (82)
  1. rem/__init__.py +129 -2
  2. rem/agentic/README.md +76 -0
  3. rem/agentic/__init__.py +15 -0
  4. rem/agentic/agents/__init__.py +16 -2
  5. rem/agentic/agents/sse_simulator.py +500 -0
  6. rem/agentic/context.py +7 -5
  7. rem/agentic/llm_provider_models.py +301 -0
  8. rem/agentic/providers/phoenix.py +32 -43
  9. rem/agentic/providers/pydantic_ai.py +84 -10
  10. rem/api/README.md +238 -1
  11. rem/api/deps.py +255 -0
  12. rem/api/main.py +70 -22
  13. rem/api/mcp_router/server.py +8 -1
  14. rem/api/mcp_router/tools.py +80 -0
  15. rem/api/middleware/tracking.py +172 -0
  16. rem/api/routers/admin.py +277 -0
  17. rem/api/routers/auth.py +124 -0
  18. rem/api/routers/chat/completions.py +123 -14
  19. rem/api/routers/chat/models.py +7 -3
  20. rem/api/routers/chat/sse_events.py +526 -0
  21. rem/api/routers/chat/streaming.py +468 -45
  22. rem/api/routers/dev.py +81 -0
  23. rem/api/routers/feedback.py +455 -0
  24. rem/api/routers/messages.py +473 -0
  25. rem/api/routers/models.py +78 -0
  26. rem/api/routers/shared_sessions.py +406 -0
  27. rem/auth/middleware.py +126 -27
  28. rem/cli/commands/ask.py +15 -11
  29. rem/cli/commands/configure.py +169 -94
  30. rem/cli/commands/db.py +53 -7
  31. rem/cli/commands/experiments.py +278 -96
  32. rem/cli/commands/process.py +8 -7
  33. rem/cli/commands/scaffold.py +47 -0
  34. rem/cli/commands/schema.py +9 -9
  35. rem/cli/main.py +10 -0
  36. rem/config.py +2 -2
  37. rem/models/core/core_model.py +7 -1
  38. rem/models/entities/__init__.py +21 -0
  39. rem/models/entities/domain_resource.py +38 -0
  40. rem/models/entities/feedback.py +123 -0
  41. rem/models/entities/message.py +30 -1
  42. rem/models/entities/session.py +83 -0
  43. rem/models/entities/shared_session.py +206 -0
  44. rem/models/entities/user.py +10 -3
  45. rem/registry.py +367 -0
  46. rem/schemas/agents/rem.yaml +7 -3
  47. rem/services/content/providers.py +94 -140
  48. rem/services/content/service.py +85 -16
  49. rem/services/dreaming/affinity_service.py +2 -16
  50. rem/services/dreaming/moment_service.py +2 -15
  51. rem/services/embeddings/api.py +20 -13
  52. rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
  53. rem/services/phoenix/client.py +252 -19
  54. rem/services/postgres/README.md +29 -10
  55. rem/services/postgres/repository.py +132 -0
  56. rem/services/postgres/schema_generator.py +86 -5
  57. rem/services/rate_limit.py +113 -0
  58. rem/services/rem/README.md +14 -0
  59. rem/services/session/compression.py +17 -1
  60. rem/services/user_service.py +98 -0
  61. rem/settings.py +115 -17
  62. rem/sql/background_indexes.sql +10 -0
  63. rem/sql/migrations/001_install.sql +152 -2
  64. rem/sql/migrations/002_install_models.sql +580 -231
  65. rem/sql/migrations/003_seed_default_user.sql +48 -0
  66. rem/utils/constants.py +97 -0
  67. rem/utils/date_utils.py +228 -0
  68. rem/utils/embeddings.py +17 -4
  69. rem/utils/files.py +167 -0
  70. rem/utils/mime_types.py +158 -0
  71. rem/utils/model_helpers.py +156 -1
  72. rem/utils/schema_loader.py +273 -14
  73. rem/utils/sql_types.py +3 -1
  74. rem/utils/vision.py +9 -14
  75. rem/workers/README.md +14 -14
  76. rem/workers/db_maintainer.py +74 -0
  77. {remdb-0.2.6.dist-info → remdb-0.3.103.dist-info}/METADATA +486 -132
  78. {remdb-0.2.6.dist-info → remdb-0.3.103.dist-info}/RECORD +80 -57
  79. {remdb-0.2.6.dist-info → remdb-0.3.103.dist-info}/WHEEL +1 -1
  80. rem/sql/002_install_models.sql +0 -1068
  81. rem/sql/install_models.sql +0 -1038
  82. {remdb-0.2.6.dist-info → remdb-0.3.103.dist-info}/entry_points.txt +0 -0
rem/services/content/providers.py

@@ -2,17 +2,27 @@
 
  import json
  import multiprocessing
- import os
  import random
  import subprocess
  import sys
- import tempfile
  from abc import ABC, abstractmethod
  from pathlib import Path
  from typing import Any, Optional
 
  from loguru import logger
 
+ from rem.utils.constants import (
+     AUDIO_CHUNK_TARGET_SECONDS,
+     AUDIO_CHUNK_WINDOW_SECONDS,
+     MIN_SILENCE_MS,
+     SILENCE_THRESHOLD_DB,
+     SUBPROCESS_TIMEOUT_SECONDS,
+     WAV_HEADER_MIN_BYTES,
+     WHISPER_COST_PER_MINUTE,
+ )
+ from rem.utils.files import temp_file_from_bytes
+ from rem.utils.mime_types import get_extension
+
 
  class ContentProvider(ABC):
      """Base class for content extraction providers."""
@@ -132,7 +142,7 @@ import sys
  from pathlib import Path
  from kreuzberg import ExtractionConfig, extract_file_sync
 
- # Parse document with table extraction
+ # Parse document with kreuzberg 3.x
  config = ExtractionConfig(
      extract_tables=True,
      chunk_content=False,
@@ -144,13 +154,7 @@ result = extract_file_sync(Path(sys.argv[1]), config=config)
  # Serialize result to JSON
  output = {
      'content': result.content,
-     'tables': [
-         {
-             'page_number': t.get('page_number', 0),
-             'text': t.get('text', ''),
-         }
-         for t in result.tables
-     ],
+     'tables': [t.model_dump() for t in result.tables] if result.tables else [],
      'metadata': result.metadata
  }
  print(json.dumps(output))
@@ -161,7 +165,7 @@ print(json.dumps(output))
          [sys.executable, "-c", script, str(file_path)],
          capture_output=True,
          text=True,
-         timeout=300,  # 5 minute timeout
+         timeout=SUBPROCESS_TIMEOUT_SECONDS,
      )
 
      if result.returncode != 0:
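The subprocess call above is the daemon-process workaround: kreuzberg runs in a fresh interpreter and hands its result back as JSON on stdout. A minimal, self-contained sketch of that pattern (the embedded script here is a placeholder, not rem's real extraction script):

```python
import json
import subprocess
import sys
from pathlib import Path

# Minimal sketch of the subprocess-isolation pattern: run extraction in a clean
# interpreter so it does not inherit daemon-process restrictions, then parse
# the JSON it prints.
SCRIPT = """
import json, sys
from pathlib import Path
print(json.dumps({"content": Path(sys.argv[1]).name, "tables": [], "metadata": {}}))
"""

def extract_in_subprocess(file_path: Path, timeout_seconds: int = 300) -> dict:
    result = subprocess.run(
        [sys.executable, "-c", SCRIPT, str(file_path)],
        capture_output=True,
        text=True,
        timeout=timeout_seconds,
    )
    if result.returncode != 0:
        raise RuntimeError(f"Subprocess extraction failed: {result.stderr}")
    return json.loads(result.stdout)
```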
@@ -183,21 +187,9 @@ print(json.dumps(output))
          # Write bytes to temp file for kreuzberg
          # Detect extension from metadata
          content_type = metadata.get("content_type", "")
-         extension_map = {
-             "application/pdf": ".pdf",
-             "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
-             "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
-             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
-             "image/png": ".png",
-             "image/jpeg": ".jpg",
-         }
-         suffix = extension_map.get(content_type, ".pdf")  # Default to PDF
-
-         with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
-             tmp.write(content)
-             tmp_path = Path(tmp.name)
+         suffix = get_extension(content_type, default=".pdf")
 
-         try:
+         with temp_file_from_bytes(content, suffix=suffix) as tmp_path:
              # Check if running in daemon process
              if self._is_daemon_process():
                  logger.info("Daemon process detected - using subprocess workaround for document parsing")
@@ -211,7 +203,7 @@ print(json.dumps(output))
                  }
              except Exception as e:
                  logger.error(f"Subprocess parsing failed: {e}. Falling back to text-only.")
-                 # Fallback to simple text extraction
+                 # Fallback to simple text extraction (kreuzberg 3.x API)
                  from kreuzberg import ExtractionConfig, extract_file_sync
                  config = ExtractionConfig(extract_tables=False)
                  result = extract_file_sync(tmp_path, config=config)
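`get_extension` and `temp_file_from_bytes` replace the per-provider extension maps and the manual NamedTemporaryFile/unlink dance. Their implementations live in the new rem/utils/mime_types.py and rem/utils/files.py, which this diff does not include; judging by the call sites, they presumably look roughly like this:

```python
import tempfile
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

# Hypothetical sketches only -- the real helpers are in rem/utils/files.py and
# rem/utils/mime_types.py, which are not part of this diff.

@contextmanager
def temp_file_from_bytes(content: bytes, suffix: str = "") -> Iterator[Path]:
    """Write bytes to a named temp file, yield its path, delete it on exit."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(content)
        tmp_path = Path(tmp.name)
    try:
        yield tmp_path
    finally:
        tmp_path.unlink(missing_ok=True)

# Consolidates the extension_map dicts removed from the individual providers.
_EXTENSION_MAP = {
    "application/pdf": ".pdf",
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "audio/wav": ".wav",
    "audio/mpeg": ".mp3",
}

def get_extension(content_type: str, default: str = "") -> str:
    """Map a MIME type to a file extension, falling back to `default`."""
    return _EXTENSION_MAP.get(content_type, default)
```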
@@ -221,17 +213,17 @@ print(json.dumps(output))
                      "file_extension": tmp_path.suffix,
                  }
              else:
-                 # Normal execution (not in daemon)
+                 # Normal execution (not in daemon) - kreuzberg 4.x with native ONNX/Rust
                  from kreuzberg import ExtractionConfig, extract_file_sync
                  config = ExtractionConfig(
-                     extract_tables=True,
-                     chunk_content=False,
-                     extract_keywords=False,
+                     enable_quality_processing=True,  # Enables table extraction with native ONNX
+                     chunk_content=False,  # We handle chunking ourselves
+                     extract_tables=False,  # Disable table extraction to avoid PyTorch dependency
                  )
                  result = extract_file_sync(tmp_path, config=config)
                  text = result.content
                  extraction_metadata = {
-                     "table_count": len(result.tables),
+                     "table_count": len(result.tables) if result.tables else 0,
                      "parser": "kreuzberg",
                      "file_extension": tmp_path.suffix,
                  }
@@ -241,10 +233,6 @@ print(json.dumps(output))
                  "metadata": extraction_metadata,
              }
 
-         finally:
-             # Clean up temp file
-             tmp_path.unlink(missing_ok=True)
-
 
  class AudioProvider(ContentProvider):
      """
@@ -289,19 +277,20 @@ class AudioProvider(ContentProvider):
              ValueError: If OpenAI API key missing
          """
          # Handle empty or invalid content
-         if not content or len(content) < 44:  # WAV header is minimum 44 bytes
+         if not content or len(content) < WAV_HEADER_MIN_BYTES:
              logger.warning("Audio content too small to be valid WAV file")
              return {
                  "text": "[Invalid or empty audio file]",
                  "metadata": {"error": "invalid_content", "size": len(content)},
              }
 
-         # Check for OpenAI API key
-         api_key = os.getenv("OPENAI_API_KEY")
+         # Check for OpenAI API key (use settings)
+         from rem.settings import settings
+         api_key = settings.llm.openai_api_key
          if not api_key:
-             logger.warning("No OPENAI_API_KEY found - audio transcription disabled")
+             logger.warning("No OpenAI API key found - audio transcription disabled")
              return {
-                 "text": "[Audio transcription requires OPENAI_API_KEY environment variable]",
+                 "text": "[Audio transcription requires LLM__OPENAI_API_KEY to be set]",
                  "metadata": {"error": "missing_api_key"},
              }
 
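The key is now read from rem's settings object rather than os.getenv, and the `LLM__OPENAI_API_KEY` hint in the new message suggests nested pydantic-settings with a double-underscore env delimiter. A hedged sketch of that layout (rem/settings.py is only partially touched by this diff and its real structure is not shown):

```python
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict

# Illustrative layout only -- not copied from rem/settings.py.
class LLMSettings(BaseModel):
    openai_api_key: str | None = None

class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_nested_delimiter="__")
    llm: LLMSettings = LLMSettings()

# With this layout, exporting LLM__OPENAI_API_KEY=sk-... populates
# settings.llm.openai_api_key, which is what AudioProvider now checks.
settings = Settings()
```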
@@ -318,83 +307,74 @@ class AudioProvider(ContentProvider):
          # Write bytes to temp file
          # Detect extension from metadata or use .wav as fallback
          content_type = metadata.get("content_type", "audio/wav")
-         extension_map = {
-             "audio/wav": ".wav",
-             "audio/mpeg": ".mp3",
-             "audio/mp4": ".m4a",
-             "audio/x-m4a": ".m4a",
-             "audio/flac": ".flac",
-             "audio/ogg": ".ogg",
-         }
-         extension = extension_map.get(content_type, ".wav")
+         extension = get_extension(content_type, default=".wav")
 
-         with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
-             tmp.write(content)
-             tmp_path = Path(tmp.name)
+         chunker = None
+         chunks = None
 
-         try:
-             logger.info(f"Processing audio file: {tmp_path.name} ({len(content) / 1024 / 1024:.1f} MB)")
-
-             # Step 1: Chunk audio by silence
-             chunker = AudioChunker(
-                 target_chunk_seconds=60.0,
-                 chunk_window_seconds=2.0,
-                 silence_threshold_db=-40.0,
-                 min_silence_ms=500,
-             )
-
-             chunks = chunker.chunk_audio(tmp_path)
-             logger.info(f"Created {len(chunks)} audio chunks")
-
-             # Step 2: Transcribe chunks
-             transcriber = AudioTranscriber(api_key=api_key)
-             results = transcriber.transcribe_chunks(chunks)
-             logger.info(f"Transcribed {len(results)} chunks")
-
-             # Step 3: Combine into markdown format
-             # Format: Each chunk becomes a section with timestamp
-             markdown_parts = []
-             for result in results:
-                 timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
-                 markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")
-
-             markdown_text = "\n".join(markdown_parts)
-
-             # Calculate metadata
-             total_duration = sum(r.duration_seconds for r in results)
-             estimated_cost = (total_duration / 60) * 0.006  # $0.006 per minute
-             successful_chunks = sum(1 for r in results if r.confidence > 0)
-
-             extraction_metadata = {
-                 "chunk_count": len(chunks),
-                 "transcribed_chunks": successful_chunks,
-                 "duration_seconds": total_duration,
-                 "estimated_cost": estimated_cost,
-                 "parser": "whisper_api",
-             }
+         with temp_file_from_bytes(content, suffix=extension) as tmp_path:
+             try:
+                 logger.info(f"Processing audio file: {tmp_path.name} ({len(content) / 1024 / 1024:.1f} MB)")
+
+                 # Step 1: Chunk audio by silence
+                 chunker = AudioChunker(
+                     target_chunk_seconds=AUDIO_CHUNK_TARGET_SECONDS,
+                     chunk_window_seconds=AUDIO_CHUNK_WINDOW_SECONDS,
+                     silence_threshold_db=SILENCE_THRESHOLD_DB,
+                     min_silence_ms=MIN_SILENCE_MS,
+                 )
 
-             logger.info(
-                 f"Transcription complete: {successful_chunks}/{len(chunks)} chunks, "
-                 f"${estimated_cost:.3f} cost"
-             )
+                 chunks = chunker.chunk_audio(tmp_path)
+                 logger.info(f"Created {len(chunks)} audio chunks")
 
-             return {
-                 "text": markdown_text,
-                 "metadata": extraction_metadata,
-             }
+                 # Step 2: Transcribe chunks
+                 transcriber = AudioTranscriber(api_key=api_key)
+                 results = transcriber.transcribe_chunks(chunks)
+                 logger.info(f"Transcribed {len(results)} chunks")
 
-         except Exception as e:
-             logger.error(f"Audio extraction failed: {e}")
-             raise RuntimeError(f"Audio transcription failed: {e}") from e
+                 # Step 3: Combine into markdown format
+                 # Format: Each chunk becomes a section with timestamp
+                 markdown_parts = []
+                 for result in results:
+                     timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
+                     markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")
+
+                 markdown_text = "\n".join(markdown_parts)
+
+                 # Calculate metadata
+                 total_duration = sum(r.duration_seconds for r in results)
+                 estimated_cost = (total_duration / 60) * WHISPER_COST_PER_MINUTE
+                 successful_chunks = sum(1 for r in results if r.confidence > 0)
+
+                 extraction_metadata = {
+                     "chunk_count": len(chunks),
+                     "transcribed_chunks": successful_chunks,
+                     "duration_seconds": total_duration,
+                     "estimated_cost": estimated_cost,
+                     "parser": "whisper_api",
+                 }
+
+                 logger.info(
+                     f"Transcription complete: {successful_chunks}/{len(chunks)} chunks, "
+                     f"${estimated_cost:.3f} cost"
+                 )
+
+                 return {
+                     "text": markdown_text,
+                     "metadata": extraction_metadata,
+                 }
 
-         finally:
-             # Clean up temp file and chunks
-             try:
-                 tmp_path.unlink(missing_ok=True)
-                 if 'chunker' in locals() and 'chunks' in locals():
-                     chunker.cleanup_chunks(chunks)
              except Exception as e:
-                 logger.warning(f"Cleanup failed: {e}")
+                 logger.error(f"Audio extraction failed: {e}")
+                 raise RuntimeError(f"Audio transcription failed: {e}") from e
+
+             finally:
+                 # Clean up audio chunks (temp file cleanup handled by context manager)
+                 if chunker is not None and chunks is not None:
+                     try:
+                         chunker.cleanup_chunks(chunks)
+                     except Exception as e:
+                         logger.warning(f"Chunk cleanup failed: {e}")
 
 
  class SchemaProvider(ContentProvider):
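For reference, step 3 emits one timestamped section per chunk, so a two-chunk recording yields markdown shaped like the sample below (timestamps and text are illustrative), and the cost estimate is simply minutes multiplied by WHISPER_COST_PER_MINUTE: 10 minutes of audio at the old inline rate of $0.006/minute comes to about $0.06.

```
## [0.0s - 58.3s]

First transcribed segment...

## [58.3s - 117.9s]

Second transcribed segment...
```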
@@ -672,19 +652,9 @@ class ImageProvider(ContentProvider):
 
              # Write bytes to temp file for analysis
              content_type = metadata.get("content_type", "image/png")
-             extension_map = {
-                 "image/png": ".png",
-                 "image/jpeg": ".jpg",
-                 "image/gif": ".gif",
-                 "image/webp": ".webp",
-             }
-             extension = extension_map.get(content_type, ".png")
-
-             with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
-                 tmp.write(content)
-                 tmp_path = Path(tmp.name)
+             extension = get_extension(content_type, default=".png")
 
-             try:
+             with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                  # Analyze image
                  result = analyzer.analyze_image(tmp_path)
                  vision_description = result.description
@@ -692,9 +662,6 @@ class ImageProvider(ContentProvider):
                  vision_model = result.model
 
                  logger.info(f"Vision analysis complete: {len(vision_description)} chars")
-             finally:
-                 # Clean up temp file
-                 tmp_path.unlink(missing_ok=True)
 
          except ImportError as e:
              logger.warning(f"Vision analysis not available: {e}")
@@ -737,19 +704,9 @@ class ImageProvider(ContentProvider):
          if embedder.is_available():
              # Write bytes to temp file for CLIP embedding
              content_type = metadata.get("content_type", "image/png")
-             extension_map = {
-                 "image/png": ".png",
-                 "image/jpeg": ".jpg",
-                 "image/gif": ".gif",
-                 "image/webp": ".webp",
-             }
-             extension = extension_map.get(content_type, ".png")
+             extension = get_extension(content_type, default=".png")
 
-             with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
-                 tmp.write(content)
-                 tmp_path = Path(tmp.name)
-
-             try:
+             with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                  # Generate CLIP embedding
                  result = embedder.embed_image(tmp_path)
                  if result:
@@ -759,9 +716,6 @@ class ImageProvider(ContentProvider):
                      logger.info(
                          f"CLIP embedding generated: {clip_dimensions} dims, {clip_tokens} tokens"
                      )
-             finally:
-                 # Clean up temp file
-                 tmp_path.unlink(missing_ok=True)
          else:
              logger.debug(
                  "CLIP embeddings disabled - set CONTENT__JINA_API_KEY to enable. "
rem/services/content/service.py

@@ -159,13 +159,22 @@ class ContentService:
 
              extracted_content = provider.extract(content_bytes, metadata)
 
-             return {
+             # Build result with standard fields
+             result = {
                  "uri": uri,
                  "content": extracted_content["text"],
                  "metadata": {**metadata, **extracted_content.get("metadata", {})},
                  "provider": provider.name,
              }
 
+             # Preserve schema-specific fields if present (from SchemaProvider)
+             if "is_schema" in extracted_content:
+                 result["is_schema"] = extracted_content["is_schema"]
+             if "schema_data" in extracted_content:
+                 result["schema_data"] = extracted_content["schema_data"]
+
+             return result
+
          except ClientError as e:
              error_code = e.response.get("Error", {}).get("Code", "")
              if error_code == "NoSuchKey":
@@ -221,13 +230,22 @@ class ContentService:
          provider = self._get_provider(file_path.suffix)
          extracted_content = provider.extract(content_bytes, metadata)
 
-         return {
+         # Build result with standard fields
+         result = {
              "uri": str(file_path.absolute()),
              "content": extracted_content["text"],
              "metadata": {**metadata, **extracted_content.get("metadata", {})},
              "provider": provider.name,
          }
 
+         # Preserve schema-specific fields if present (from SchemaProvider)
+         if "is_schema" in extracted_content:
+             result["is_schema"] = extracted_content["is_schema"]
+         if "schema_data" in extracted_content:
+             result["schema_data"] = extracted_content["schema_data"]
+
+         return result
+
      def _get_provider(self, suffix: str) -> ContentProvider:
          """Get content provider for file extension."""
          suffix_lower = suffix.lower()
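With the passthrough in place, a schema YAML recognized by SchemaProvider now comes back from these read methods with the flags at the top level rather than buried in metadata. An illustrative shape (all values invented; keys match the code above):

```python
# Illustrative result shape only -- values are made up for the example.
result = {
    "uri": "s3://bucket/agents/my-agent.yaml",
    "content": "...extracted text...",
    "metadata": {"content_type": "application/x-yaml", "kind": "agent"},
    "provider": "SchemaProvider",       # provider.name; exact string may differ
    "is_schema": True,                  # now preserved at the top level
    "schema_data": {"kind": "agent"},   # parsed schema payload from the provider
}
```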
@@ -260,6 +278,7 @@ class ContentService:
          category: str | None = None,
          tags: list[str] | None = None,
          is_local_server: bool = False,
+         resource_type: str | None = None,
      ) -> dict[str, Any]:
          """
          Complete file ingestion pipeline: read → store → parse → chunk → embed.
@@ -304,6 +323,9 @@
              category: Optional category tag (document, code, audio, etc.)
              tags: Optional list of tags
              is_local_server: True if running as local/stdio MCP server
+             resource_type: Optional resource type (case-insensitive). Supports:
+                 - "resource", "resources", "Resource" → Resource (default)
+                 - "domain-resource", "domain_resource", "DomainResource" → DomainResource
 
          Returns:
              dict with:
@@ -400,6 +422,7 @@
              processing_result = await self.process_and_save(
                  uri=storage_uri,
                  user_id=user_id,
+                 resource_type=resource_type,
              )
              processing_status = processing_result.get("status", "completed")
              resources_created = processing_result.get("chunk_count", 0)
@@ -441,7 +464,12 @@
              "message": f"File ingested and {processing_status}. Created {resources_created} resources.",
          }
 
-     async def process_and_save(self, uri: str, user_id: str | None = None) -> dict[str, Any]:
+     async def process_and_save(
+         self,
+         uri: str,
+         user_id: str | None = None,
+         resource_type: str | None = None,
+     ) -> dict[str, Any]:
          """
          Process file end-to-end: extract → markdown → chunk → save.
 
@@ -456,6 +484,8 @@
          Args:
              uri: File URI (s3://bucket/key or local path)
              user_id: Optional user ID for multi-tenancy
+             resource_type: Optional resource type (case-insensitive). Defaults to "Resource".
+                 Supports: resource, domain-resource, domain_resource, DomainResource, etc.
 
          Returns:
              dict with file metadata and chunk count
@@ -470,8 +500,9 @@
          file_suffix = Path(uri).suffix.lower()
          if file_suffix in ['.yaml', '.yml', '.json']:
              # Check if schema provider detected a valid schema
-             if result.get('metadata', {}).get('is_schema'):
-                 logger.info(f"🔧 Custom provider flow initiated: kind={result['metadata'].get('kind')} for {filename}")
+             # is_schema flag is at top level of result (preserved from SchemaProvider)
+             if result.get('is_schema'):
+                 logger.info(f"🔧 Custom provider flow initiated: kind={result.get('metadata', {}).get('kind')} for {filename}")
                  return await self._process_schema(result, uri, user_id)
 
          # Check for engram kind in raw data
@@ -515,9 +546,26 @@
          await self.file_repo.upsert(file)
          logger.info(f"Saved File: {filename}")
 
-         # Create Resource entities for each chunk
-         resources = [
-             Resource(
+         # Resolve resource model class from type parameter (case-insensitive)
+         from typing import cast, Type
+         from pydantic import BaseModel
+         from rem.utils.model_helpers import model_from_arbitrary_casing, get_table_name
+
+         resource_model: Type[BaseModel] = Resource  # Default
+         if resource_type:
+             try:
+                 resource_model = model_from_arbitrary_casing(resource_type)
+                 logger.info(f"Using resource model: {resource_model.__name__}")
+             except ValueError as e:
+                 logger.warning(f"Invalid resource_type '{resource_type}', using default Resource: {e}")
+                 resource_model = Resource
+
+         # Get table name for the resolved model
+         table_name = get_table_name(resource_model)
+
+         # Create resource entities for each chunk
+         resources: list[BaseModel] = [
+             resource_model(
                  name=f"{filename}#chunk-{i}",
                  uri=f"{uri}#chunk-{i}",
                  ordinal=i,
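`model_from_arbitrary_casing` and `get_table_name` come from rem/utils/model_helpers.py, which gained roughly 156 lines in this release but is not shown here. Judging by the call sites and the docstring additions above, they behave roughly like this sketch (the normalization rules and table-name convention are assumptions):

```python
from pydantic import BaseModel

# Hypothetical sketch of the helpers used above; the real implementations in
# rem/utils/model_helpers.py are not part of this diff.
class Resource(BaseModel):        # stand-in for rem.models.entities.Resource
    name: str

class DomainResource(BaseModel):  # stand-in for rem.models.entities.DomainResource
    name: str

_KNOWN_MODELS = {"resource": Resource, "domainresource": DomainResource}

def model_from_arbitrary_casing(name: str) -> type[BaseModel]:
    """Resolve 'domain-resource', 'DomainResource', 'resources', ... to a model class."""
    key = name.lower().replace("-", "").replace("_", "").rstrip("s")
    if key not in _KNOWN_MODELS:
        raise ValueError(f"Unknown resource type: {name!r}")
    return _KNOWN_MODELS[key]

def get_table_name(model: type[BaseModel]) -> str:
    """Derive the target table name from the model class (naming convention assumed)."""
    return model.__name__.lower() + "s"
```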
@@ -529,14 +577,35 @@
              for i, chunk in enumerate(chunks)
          ]
 
-         if self.resource_repo:
-             await self.resource_repo.upsert(
-                 resources,
-                 embeddable_fields=["content"],
-                 generate_embeddings=True,
-             )
-             logger.info(f"Saved {len(resources)} Resource chunks")
-             logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
+         # Save resources to the appropriate table
+         if resources:
+             from rem.services.postgres import get_postgres_service
+
+             postgres = get_postgres_service()
+             if postgres:
+                 await postgres.connect()
+                 try:
+                     await postgres.batch_upsert(
+                         records=cast(list[BaseModel | dict], resources),
+                         model=resource_model,
+                         table_name=table_name,
+                         entity_key_field="name",
+                         embeddable_fields=["content"],
+                         generate_embeddings=True,
+                     )
+                     logger.info(f"Saved {len(resources)} {resource_model.__name__} chunks to {table_name}")
+                     logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
+                 finally:
+                     await postgres.disconnect()
+             elif self.resource_repo:
+                 # Fallback to injected repo (only works for default Resource)
+                 await self.resource_repo.upsert(
+                     resources,
+                     embeddable_fields=["content"],
+                     generate_embeddings=True,
+                 )
+                 logger.info(f"Saved {len(resources)} Resource chunks")
+                 logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
 
          return {
              "file": file.model_dump(),
rem/services/dreaming/affinity_service.py

@@ -8,12 +8,11 @@ vector similarity (fast) or LLM analysis (intelligent).
  import json
  from datetime import datetime, timedelta
  from enum import Enum
- from pathlib import Path
  from typing import Any, Optional
 
- import yaml
  from loguru import logger
 
+ from ...utils.schema_loader import load_agent_schema
  from ...agentic.providers.pydantic_ai import create_agent
  from ...agentic.serialization import serialize_agent_result
  from ...models.core import QueryType, RemQuery, SearchParameters
@@ -125,20 +124,7 @@ async def build_affinity(
      # Load LLM agent for relationship assessment if needed
      affinity_agent = None
      if mode == AffinityMode.LLM:
-         schema_path = (
-             Path(__file__).parent.parent.parent
-             / "schemas"
-             / "agents"
-             / "resource-affinity-assessor.yaml"
-         )
-
-         if not schema_path.exists():
-             raise FileNotFoundError(
-                 f"ResourceAffinityAssessor schema not found: {schema_path}"
-             )
-
-         with open(schema_path) as f:
-             agent_schema = yaml.safe_load(f)
+         agent_schema = load_agent_schema("resource-affinity-assessor")
 
          affinity_agent_runtime = await create_agent(
              agent_schema_override=agent_schema,
rem/services/dreaming/moment_service.py

@@ -8,13 +8,12 @@ with temporal boundaries and metadata.
 
  import json
  from datetime import datetime, timedelta
- from pathlib import Path
  from typing import Any, Optional
  from uuid import uuid4
 
- import yaml
  from loguru import logger
 
+ from ...utils.schema_loader import load_agent_schema
  from ...agentic.providers.pydantic_ai import create_agent
  from ...agentic.serialization import serialize_agent_result
  from ...models.entities.moment import Moment, Person
@@ -101,19 +100,7 @@ async def construct_moments(
      }
 
      # Load MomentBuilder agent schema
-     schema_path = (
-         Path(__file__).parent.parent.parent
-         / "schemas"
-         / "agents"
-         / "core"
-         / "moment-builder.yaml"
-     )
-
-     if not schema_path.exists():
-         raise FileNotFoundError(f"MomentBuilder schema not found: {schema_path}")
-
-     with open(schema_path) as f:
-         agent_schema = yaml.safe_load(f)
+     agent_schema = load_agent_schema("moment-builder")
 
      # Prepare input data for agent
      input_data = {
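Both dreaming services now delegate schema lookup to load_agent_schema from rem/utils/schema_loader.py (extended by roughly 273 lines in this release, not shown here). Since the two replaced paths lived under schemas/agents/ and schemas/agents/core/, the helper presumably searches those directories by name; a rough sketch under that assumption:

```python
from pathlib import Path
from typing import Any

import yaml

# Hypothetical sketch of load_agent_schema; the real implementation in
# rem/utils/schema_loader.py is not included in this diff.
_AGENT_SCHEMA_DIRS = (
    Path(__file__).parent.parent / "schemas" / "agents",
    Path(__file__).parent.parent / "schemas" / "agents" / "core",
)

def load_agent_schema(name: str) -> dict[str, Any]:
    """Locate <name>.yaml in the known agent schema directories and parse it."""
    for schema_dir in _AGENT_SCHEMA_DIRS:
        candidate = schema_dir / f"{name}.yaml"
        if candidate.exists():
            with open(candidate) as f:
                return yaml.safe_load(f)
    raise FileNotFoundError(f"Agent schema not found: {name}")
```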