remdb 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +801 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.7.dist-info/METADATA +1473 -0
  185. remdb-0.3.7.dist-info/RECORD +187 -0
  186. remdb-0.3.7.dist-info/WHEEL +4 -0
  187. remdb-0.3.7.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,801 @@
1
+ """Content provider plugins for different file types."""
2
+
3
+ import json
4
+ import multiprocessing
5
+ import os
6
+ import random
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ from abc import ABC, abstractmethod
11
+ from pathlib import Path
12
+ from typing import Any, Optional
13
+
14
+ from loguru import logger
15
+
16
+
17
class ContentProvider(ABC):
    """Abstract interface for plugins that turn raw file bytes into text.

    Concrete subclasses implement :meth:`extract` for one family of file
    types (plain text, documents, audio, images, schemas, ...).
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier for this provider, used in logging/debugging."""
        ...

    @abstractmethod
    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """Turn raw file bytes into extracted text content.

        Args:
            content: Raw bytes of the file to process.
            metadata: File metadata (size, content type, etc.).

        Returns:
            A dict containing at least:
                - ``text``: the extracted text content
                - ``metadata``: optional additional metadata from extraction
        """
        ...
41
+
42
+
43
class TextProvider(ContentProvider):
    """
    Text content provider for plain text formats.

    Supports:
    - Markdown (.md, .markdown) - With heading detection
    - JSON (.json) - Pretty-printed text extraction
    - YAML (.yaml, .yml) - Text extraction
    - Plain text (.txt) - Direct UTF-8 extraction
    - Code files (.py, .js, .ts, etc.) - Source code as text

    Simple UTF-8 text extraction with basic metadata.
    Future: Could add frontmatter parsing, JSON schema validation, etc.
    """

    @property
    def name(self) -> str:
        return "text"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract text content from plain text files.

        Args:
            content: Text file bytes
            metadata: File metadata

        Returns:
            dict with text and metadata: line count, markdown heading count,
            char count, and the encoding actually used to decode the bytes.
        """
        # Decode UTF-8 with a latin-1 fallback (latin-1 accepts any byte
        # sequence, so the fallback cannot fail).
        try:
            text = content.decode("utf-8")
            encoding = "utf-8"
        except UnicodeDecodeError:
            logger.debug("UTF-8 decode failed, falling back to latin-1")
            text = content.decode("latin-1")
            encoding = "latin-1"

        # Basic text analysis
        lines = text.split("\n")

        # Detect markdown-style headings (lines whose stripped form starts
        # with "#"); harmless for non-markdown files.
        headings = [line for line in lines if line.strip().startswith("#")]

        extraction_metadata = {
            "line_count": len(lines),
            # Preserved quirk: None (not 0) when no headings were found,
            # to keep downstream consumers' behavior unchanged.
            "heading_count": len(headings) if headings else None,
            "char_count": len(text),
            # Bug fix: previously hard-coded to "utf-8" even when the
            # latin-1 fallback was taken; now reports the codec actually used.
            "encoding": encoding,
        }

        return {
            "text": text,
            "metadata": extraction_metadata,
        }
97
+
98
+
99
class DocProvider(ContentProvider):
    """
    Document content provider using Kreuzberg.

    Supports multiple document formats via Kreuzberg:
    - PDF (.pdf) - Text extraction with OCR fallback
    - Word (.docx) - Native format support
    - PowerPoint (.pptx) - Slide content extraction
    - Excel (.xlsx) - Spreadsheet data extraction
    - Images (.png, .jpg) - OCR text extraction

    Handles:
    - Text extraction with OCR fallback
    - Table detection and extraction
    - Daemon process workaround for multiprocessing restrictions
    """

    @property
    def name(self) -> str:
        return "doc"

    def _is_daemon_process(self) -> bool:
        """Return True when running inside a daemon process.

        Daemon processes cannot spawn children via multiprocessing;
        extract() uses this to decide whether to run kreuzberg in a
        separate subprocess instead of in-process.
        """
        try:
            return multiprocessing.current_process().daemon
        except Exception:
            # Conservative default: if process state can't be determined,
            # assume a normal (non-daemon) process.
            return False

    def _parse_in_subprocess(self, file_path: Path) -> dict:
        """Run kreuzberg in a separate subprocess to bypass daemon restrictions.

        Launches a fresh interpreter with an inline script (via ``-c``) that
        parses ``file_path`` with table extraction enabled and prints the
        result as JSON on stdout.

        Args:
            file_path: Path to the temp file holding the document bytes.

        Returns:
            dict with keys 'content', 'tables', and 'metadata'.

        Raises:
            RuntimeError: if the subprocess exits non-zero.
            subprocess.TimeoutExpired: if parsing exceeds the 5-minute timeout.
        """
        # The script must be left-aligned: `python -c` rejects indented code.
        script = """
import json
import sys
from pathlib import Path
from kreuzberg import ExtractionConfig, extract_file_sync

# Parse document with table extraction (requires PyTorch - Python <3.13 required)
config = ExtractionConfig(
    extract_tables=True,
    chunk_content=False,
    extract_keywords=False,
)

result = extract_file_sync(Path(sys.argv[1]), config=config)

# Serialize result to JSON
output = {
    'content': result.content,
    'tables': [t.model_dump() for t in result.tables] if result.tables else [],
    'metadata': result.metadata
}
print(json.dumps(output))
"""

        # Run in subprocess; with `-c`, the extra argument arrives as
        # sys.argv[1] inside the script.
        result = subprocess.run(
            [sys.executable, "-c", script, str(file_path)],
            capture_output=True,
            text=True,
            timeout=300,  # 5 minute timeout
        )

        if result.returncode != 0:
            raise RuntimeError(f"Subprocess parsing failed: {result.stderr}")

        return json.loads(result.stdout)

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract document content using Kreuzberg.

        Args:
            content: Document file bytes
            metadata: File metadata (should include content_type or extension)

        Returns:
            dict with text and extraction metadata (parser used, table count
            where available, and the temp file's extension)
        """
        # Write bytes to temp file for kreuzberg.
        # Detect extension from metadata; kreuzberg picks its parser from
        # the file suffix, so an accurate suffix matters.
        content_type = metadata.get("content_type", "")
        extension_map = {
            "application/pdf": ".pdf",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
            "image/png": ".png",
            "image/jpeg": ".jpg",
        }
        suffix = extension_map.get(content_type, ".pdf")  # Default to PDF

        # delete=False so the path survives the `with` block; cleanup happens
        # in the `finally` below.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(content)
            tmp_path = Path(tmp.name)

        try:
            # Check if running in daemon process
            if self._is_daemon_process():
                logger.info("Daemon process detected - using subprocess workaround for document parsing")
                try:
                    result_dict = self._parse_in_subprocess(tmp_path)
                    text = result_dict["content"]
                    extraction_metadata = {
                        "table_count": len(result_dict["tables"]),
                        "parser": "kreuzberg_subprocess",
                        "file_extension": tmp_path.suffix,
                    }
                except Exception as e:
                    logger.error(f"Subprocess parsing failed: {e}. Falling back to text-only.")
                    # Fallback to simple text extraction (no tables) in-process;
                    # NOTE(review): this assumes kreuzberg without table
                    # extraction works inside a daemon — confirm with kreuzberg docs.
                    from kreuzberg import ExtractionConfig, extract_file_sync
                    config = ExtractionConfig(extract_tables=False)
                    result = extract_file_sync(tmp_path, config=config)
                    text = result.content
                    extraction_metadata = {
                        "parser": "kreuzberg_fallback",
                        "file_extension": tmp_path.suffix,
                    }
            else:
                # Normal execution (not in daemon)
                from kreuzberg import ExtractionConfig, extract_file_sync
                # Table extraction with gmft (requires PyTorch - Python <3.13 required)
                config = ExtractionConfig(
                    extract_tables=True,
                    chunk_content=False,
                    extract_keywords=False,
                )
                result = extract_file_sync(tmp_path, config=config)
                text = result.content
                extraction_metadata = {
                    "table_count": len(result.tables) if result.tables else 0,
                    "parser": "kreuzberg",
                    "file_extension": tmp_path.suffix,
                }

            return {
                "text": text,
                "metadata": extraction_metadata,
            }

        finally:
            # Clean up temp file
            tmp_path.unlink(missing_ok=True)
242
+
243
+
244
class AudioProvider(ContentProvider):
    """
    Audio content provider using AudioChunker + OpenAI Whisper.

    Handles:
    - Audio chunking by silence near minute boundaries
    - Transcription via OpenAI Whisper API
    - Converts chunks to markdown format
    - Supports WAV, M4A, MP3, FLAC, OGG (via pydub + ffmpeg)

    Process:
    1. Write audio bytes to temp file
    2. Chunk audio by silence (AudioChunker)
    3. Transcribe chunks (AudioTranscriber)
    4. Combine into markdown format with timestamps
    5. Clean up temp files

    Returns markdown-formatted transcription that integrates
    seamlessly with ContentService's markdown → chunk → embed pipeline.
    """

    @property
    def name(self) -> str:
        return "audio"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract audio content via transcription.

        Args:
            content: Audio file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Markdown-formatted transcription with timestamps, or a
              bracketed placeholder string when content is invalid, the
              OPENAI_API_KEY env var is missing, or the optional audio
              dependencies are not installed (these cases do NOT raise)
            - metadata: Extraction metadata (chunk_count, duration, cost),
              or an "error" marker for the placeholder cases above

        Raises:
            RuntimeError: If chunking or transcription itself fails
        """
        # Handle empty or invalid content. The 44-byte floor is the minimum
        # WAV header size; it is applied to all formats as a cheap sanity check.
        if not content or len(content) < 44:  # WAV header is minimum 44 bytes
            logger.warning("Audio content too small to be valid WAV file")
            return {
                "text": "[Invalid or empty audio file]",
                "metadata": {"error": "invalid_content", "size": len(content)},
            }

        # Check for OpenAI API key — transcription is disabled (not an error)
        # without it.
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            logger.warning("No OPENAI_API_KEY found - audio transcription disabled")
            return {
                "text": "[Audio transcription requires OPENAI_API_KEY environment variable]",
                "metadata": {"error": "missing_api_key"},
            }

        # Import audio services (lazy import — they live in an optional extra)
        try:
            from rem.services.audio import AudioChunker, AudioTranscriber
        except ImportError as e:
            logger.error(f"Audio services not available: {e}")
            return {
                "text": "[Audio processing requires: pip install rem[audio]]",
                "metadata": {"error": "missing_dependencies"},
            }

        # Write bytes to temp file.
        # Detect extension from metadata or use .wav as fallback.
        content_type = metadata.get("content_type", "audio/wav")
        extension_map = {
            "audio/wav": ".wav",
            "audio/mpeg": ".mp3",
            "audio/mp4": ".m4a",
            "audio/x-m4a": ".m4a",
            "audio/flac": ".flac",
            "audio/ogg": ".ogg",
        }
        extension = extension_map.get(content_type, ".wav")

        # delete=False so the path outlives the `with`; removed in `finally`.
        with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
            tmp.write(content)
            tmp_path = Path(tmp.name)

        try:
            logger.info(f"Processing audio file: {tmp_path.name} ({len(content) / 1024 / 1024:.1f} MB)")

            # Step 1: Chunk audio by silence
            chunker = AudioChunker(
                target_chunk_seconds=60.0,
                chunk_window_seconds=2.0,
                silence_threshold_db=-40.0,
                min_silence_ms=500,
            )

            chunks = chunker.chunk_audio(tmp_path)
            logger.info(f"Created {len(chunks)} audio chunks")

            # Step 2: Transcribe chunks
            transcriber = AudioTranscriber(api_key=api_key)
            results = transcriber.transcribe_chunks(chunks)
            logger.info(f"Transcribed {len(results)} chunks")

            # Step 3: Combine into markdown format.
            # Format: Each chunk becomes a section with a timestamp heading.
            markdown_parts = []
            for result in results:
                timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
                markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")

            markdown_text = "\n".join(markdown_parts)

            # Calculate metadata.
            total_duration = sum(r.duration_seconds for r in results)
            estimated_cost = (total_duration / 60) * 0.006  # $0.006 per minute
            # NOTE(review): confidence > 0 is treated as "transcribed" —
            # assumes the transcriber reports 0 for failed chunks; confirm.
            successful_chunks = sum(1 for r in results if r.confidence > 0)

            extraction_metadata = {
                "chunk_count": len(chunks),
                "transcribed_chunks": successful_chunks,
                "duration_seconds": total_duration,
                "estimated_cost": estimated_cost,
                "parser": "whisper_api",
            }

            logger.info(
                f"Transcription complete: {successful_chunks}/{len(chunks)} chunks, "
                f"${estimated_cost:.3f} cost"
            )

            return {
                "text": markdown_text,
                "metadata": extraction_metadata,
            }

        except Exception as e:
            logger.error(f"Audio extraction failed: {e}")
            raise RuntimeError(f"Audio transcription failed: {e}") from e

        finally:
            # Clean up temp file and chunks. The locals() guard covers early
            # failures where chunker/chunks were never assigned.
            try:
                tmp_path.unlink(missing_ok=True)
                if 'chunker' in locals() and 'chunks' in locals():
                    chunker.cleanup_chunks(chunks)
            except Exception as e:
                logger.warning(f"Cleanup failed: {e}")
393
+
394
+
395
class SchemaProvider(ContentProvider):
    """
    Schema content provider for agent/evaluator schemas.

    Detects and processes YAML/JSON files containing:
    - Agent schemas (type: object with json_schema_extra.kind: agent and json_schema_extra.name: <name>)
    - Evaluator schemas (type: object with json_schema_extra.kind: evaluator and json_schema_extra.name: <name>)

    Stores schemas in the schemas table with deterministic IDs for upsert by name.

    Pattern:
    - Checks for schema markers (type: object + kind + name)
    - Generates deterministic ID for upsert (tenant+schema_name)
    - Stores full schema JSON in schemas table
    - Extracts metadata (version, tags, provider_configs, embedding_fields)
    """

    @property
    def name(self) -> str:
        return "schema"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract and validate agent/evaluator schema.

        Args:
            content: YAML or JSON file bytes
            metadata: File metadata

        Returns:
            dict with:
            - text: Human-readable schema summary (or raw text on fallback)
            - metadata: Schema metadata
            - schema_data: Full schema dict for storage (only when is_schema)
            - is_schema: True if valid schema detected

        Raises:
            json.JSONDecodeError: If a JSON file cannot be parsed
            yaml.YAMLError: If a YAML file cannot be parsed
        """
        # Lazy import: PyYAML is only needed when schema files are processed.
        # (Cleanup: dropped the unused uuid5/NAMESPACE_DNS import and the
        # redundant local `import json` — the module-level import is used.)
        import yaml

        # Decode content (UTF-8 with latin-1 fallback, matching TextProvider)
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            text = content.decode("latin-1")

        # Parse according to the declared content type.
        if metadata.get("content_type") == "application/json":
            schema_data = json.loads(text)  # Raises JSONDecodeError on invalid JSON
        else:
            # YAML is a superset of JSON, so this handles both formats
            schema_data = yaml.safe_load(text)  # Raises yaml.YAMLError on invalid YAML

        # A schema must be a mapping at the top level; anything else falls
        # back to plain-text handling.
        if not isinstance(schema_data, dict):
            return {
                "text": text,
                "metadata": {"parser": "schema_fallback"},
                "is_schema": False,
            }

        # Check for schema markers
        is_object_type = schema_data.get("type") == "object"
        json_schema_extra = schema_data.get("json_schema_extra", {})
        kind = json_schema_extra.get("kind", "")
        schema_name = json_schema_extra.get("name", "")

        # Must have type: object, kind (agent or evaluator), and name
        is_agent_schema = is_object_type and kind == "agent" and schema_name
        is_evaluator_schema = is_object_type and kind == "evaluator" and schema_name

        if not (is_agent_schema or is_evaluator_schema):
            return {
                "text": text,
                "metadata": {"parser": "schema_fallback"},
                "is_schema": False,
            }

        # Extract schema metadata
        schema_type = kind  # "agent" or "evaluator"
        version = json_schema_extra.get("version", "1.0.0")
        tags = json_schema_extra.get("tags", [])

        # Use name directly (already in kebab-case format)
        short_name = schema_name

        # Build human-readable summary
        description = schema_data.get("description", "No description provided")
        description_preview = description[:200] + "..." if len(description) > 200 else description

        properties = schema_data.get("properties", {})
        required_fields = schema_data.get("required", [])

        summary_parts = [
            f"# {schema_type.title()} Schema: {short_name}",
            f"**Version:** {version}",
            f"**Name:** {schema_name}",
            f"**Kind:** {kind}",
            "",
            "## Description",
            description_preview,
            "",
            "## Output Fields",
        ]

        for field_name, field_spec in list(properties.items())[:10]:  # Limit to 10 fields
            field_type = field_spec.get("type", "unknown")
            field_desc = field_spec.get("description", "")
            required = " (required)" if field_name in required_fields else ""
            summary_parts.append(f"- **{field_name}**: {field_type}{required} - {field_desc[:50]}")

        if len(properties) > 10:
            summary_parts.append(f"- ... and {len(properties) - 10} more fields")

        text_summary = "\n".join(summary_parts)

        # Extract additional metadata
        extraction_metadata = {
            "parser": "schema",
            "schema_type": schema_type,
            "short_name": short_name,
            "version": version,
            "kind": kind,
            "name": schema_name,
            "tags": tags,
            "field_count": len(properties),
            "required_field_count": len(required_fields),
            "provider_configs": json_schema_extra.get("provider_configs", []),
            "embedding_fields": json_schema_extra.get("embedding_fields", []),
            "category": json_schema_extra.get("category"),
        }

        return {
            "text": text_summary,
            "metadata": extraction_metadata,
            "schema_data": schema_data,
            "is_schema": True,
        }
536
+
537
+
538
class ImageProvider(ContentProvider):
    """
    Image content provider with vision LLM analysis and CLIP embeddings.

    Features:
    - Tier-based vision analysis (gold tier always gets analysis)
    - Sampling-based vision analysis for non-gold users
    - Vision LLM description generation (Anthropic, Gemini, OpenAI)
    - Future: CLIP embeddings for semantic image search

    Process:
    1. Check user tier and sampling rate
    2. If eligible, run vision LLM analysis
    3. Extract image metadata (dimensions, format)
    4. Return markdown description or basic metadata
    5. Save to ImageResource table (not Resource)

    Vision analysis is expensive, so it's gated by:
    - User tier (gold = always, silver/free = sampled)
    - Sample rate setting (0.0 = never, 1.0 = always)

    Vision and CLIP failures are logged and degrade gracefully; extract()
    itself does not raise for them.
    """

    # MIME type -> temp-file suffix, shared by the vision and CLIP steps
    # (previously duplicated inline in both code paths).
    _EXTENSION_MAP = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/gif": ".gif",
        "image/webp": ".webp",
    }

    def __init__(self, user_tier: Optional[str] = None):
        """
        Initialize image provider.

        Args:
            user_tier: User tier (free, silver, gold) for vision gating
        """
        self.user_tier = user_tier

    @property
    def name(self) -> str:
        return "image"

    def _should_analyze_with_vision(self, sample_rate: float) -> bool:
        """
        Determine if image should get vision LLM analysis.

        Args:
            sample_rate: Sampling rate from settings (0.0-1.0)

        Returns:
            True if should analyze, False otherwise
        """
        # Import here to avoid circular dependency
        from rem.models.entities import UserTier

        # Gold tier always gets vision analysis
        if self.user_tier == UserTier.GOLD.value:
            logger.info("Gold tier user - vision analysis enabled")
            return True

        # For non-gold users, use sampling
        if sample_rate > 0.0:
            should_analyze = random.random() < sample_rate
            if should_analyze:
                logger.info(f"Vision analysis sampled (rate={sample_rate})")
            return should_analyze

        return False

    def _temp_image_path(self, content: bytes, metadata: dict[str, Any]) -> Path:
        """Write image bytes to a temp file whose suffix matches the MIME type.

        Caller is responsible for unlinking the returned path.
        """
        content_type = metadata.get("content_type", "image/png")
        extension = self._EXTENSION_MAP.get(content_type, ".png")
        with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
            tmp.write(content)
            return Path(tmp.name)

    def _probe_image(
        self, content: bytes
    ) -> tuple[Optional[int], Optional[int], Optional[str]]:
        """Read (width, height, format) via PIL; all None if PIL is missing
        or the bytes cannot be parsed as an image."""
        try:
            from PIL import Image
            import io

            img = Image.open(io.BytesIO(content))
            return img.width, img.height, img.format or "UNKNOWN"
        except ImportError:
            logger.warning("PIL not available - image metadata extraction disabled")
        except Exception as e:
            logger.warning(f"Failed to extract image metadata: {e}")
        return None, None, None

    def _analyze_with_vision(
        self, content: bytes, metadata: dict[str, Any]
    ) -> tuple[Optional[str], Optional[str], Optional[str]]:
        """Run vision LLM analysis; returns (description, provider, model).

        All three are None when the vision stack is unavailable or analysis
        fails — errors are logged, never raised.
        """
        try:
            from rem.utils.vision import ImageAnalyzer, VisionProvider
            from rem.settings import settings

            # Get provider from settings
            provider_str = settings.content.image_vllm_provider.lower()
            provider_map = {
                "anthropic": VisionProvider.ANTHROPIC,
                "gemini": VisionProvider.GEMINI,
                "openai": VisionProvider.OPENAI,
            }
            provider = provider_map.get(provider_str, VisionProvider.ANTHROPIC)

            # Create analyzer
            analyzer = ImageAnalyzer(
                provider=provider,
                model=settings.content.image_vllm_model,
            )

            # Write bytes to temp file for analysis
            tmp_path = self._temp_image_path(content, metadata)
            try:
                # Analyze image
                result = analyzer.analyze_image(tmp_path)
                logger.info(f"Vision analysis complete: {len(result.description)} chars")
                return result.description, result.provider.value, result.model
            finally:
                # Clean up temp file
                tmp_path.unlink(missing_ok=True)

        except ImportError as e:
            logger.warning(f"Vision analysis not available: {e}")
        except Exception as e:
            logger.error(f"Vision analysis failed: {e}")
        return None, None, None

    def _embed_with_clip(
        self, content: bytes, metadata: dict[str, Any]
    ) -> tuple[Optional[Any], Optional[int], Optional[int]]:
        """Generate a CLIP embedding via Jina; returns (embedding, dimensions, tokens_used).

        Best-effort: every failure or disabled path logs and returns
        (None, None, None).
        """
        try:
            from rem.utils.clip_embeddings import JinaCLIPEmbedder
            from rem.settings import settings

            # Only attempt CLIP embeddings if using Jina provider
            if settings.content.clip_provider != "jina":
                logger.debug(
                    f"CLIP provider set to '{settings.content.clip_provider}' - "
                    "skipping Jina embeddings (self-hosted not yet implemented)"
                )
                return None, None, None

            embedder = JinaCLIPEmbedder(
                api_key=settings.content.jina_api_key,
                model=settings.content.clip_model,
            )

            if not embedder.is_available():
                logger.debug(
                    "CLIP embeddings disabled - set CONTENT__JINA_API_KEY to enable. "
                    "Get free API key at https://jina.ai/embeddings/"
                )
                return None, None, None

            # Write bytes to temp file for CLIP embedding
            tmp_path = self._temp_image_path(content, metadata)
            try:
                # Generate CLIP embedding
                result = embedder.embed_image(tmp_path)
                if result:
                    clip_dimensions = result.dimensions  # type: ignore[attr-defined]
                    clip_tokens = result.tokens_used  # type: ignore[attr-defined]
                    logger.info(
                        f"CLIP embedding generated: {clip_dimensions} dims, {clip_tokens} tokens"
                    )
                    return result.embedding, clip_dimensions, clip_tokens  # type: ignore[attr-defined]
            finally:
                # Clean up temp file
                tmp_path.unlink(missing_ok=True)

        except ImportError:
            logger.debug("CLIP embedding module not available")
        except Exception as e:
            logger.warning(f"CLIP embedding generation failed (non-fatal): {e}")
        return None, None, None

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract image content with optional vision LLM analysis.

        Args:
            content: Image file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Markdown description (if vision enabled) or basic metadata
            - metadata: Extraction metadata (dimensions, format, vision info)
            - image_specific: Additional image metadata for ImageResource
        """
        # Import settings here to avoid circular dependency
        from rem.settings import settings

        # Basic image properties via PIL (best-effort; may be all None)
        image_width, image_height, image_format = self._probe_image(content)

        # Vision analysis is expensive: gate on tier and sampling rate
        sample_rate = settings.content.image_vllm_sample_rate
        vision_description = vision_provider = vision_model = None
        if self._should_analyze_with_vision(sample_rate):
            vision_description, vision_provider, vision_model = self._analyze_with_vision(
                content, metadata
            )

        # Build text content
        if vision_description:
            # Use vision description as primary content
            text = f"# Image Analysis\n\n{vision_description}"
            if image_width and image_height:
                text += f"\n\n**Image Details:** {image_width}x{image_height} {image_format}"
        elif image_width and image_height:
            # Fallback to basic metadata
            text = f"**Image:** {image_width}x{image_height} {image_format}"
        else:
            text = "**Image:** Metadata extraction unavailable"

        # Generate CLIP embedding (if Jina API key available; best-effort)
        clip_embedding, clip_dimensions, clip_tokens = self._embed_with_clip(content, metadata)

        # Build extraction metadata
        extraction_metadata = {
            "parser": "image_provider",
            "vision_enabled": vision_description is not None,
            "vision_provider": vision_provider,
            "vision_model": vision_model,
            "image_width": image_width,
            "image_height": image_height,
            "image_format": image_format,
            "clip_enabled": clip_embedding is not None,
            "clip_dimensions": clip_dimensions,
            "clip_tokens": clip_tokens,
        }

        # Add image-specific metadata for ImageResource
        image_specific = {
            "image_width": image_width,
            "image_height": image_height,
            "image_format": image_format,
            "vision_description": vision_description,
            "vision_provider": vision_provider,
            "vision_model": vision_model,
            "clip_embedding": clip_embedding,
            "clip_dimensions": clip_dimensions,
            "clip_tokens": clip_tokens,
        }

        return {
            "text": text,
            "metadata": extraction_metadata,
            "image_specific": image_specific,
        }