remdb-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.0.dist-info/METADATA +1455 -0
  185. remdb-0.3.0.dist-info/RECORD +187 -0
  186. remdb-0.3.0.dist-info/WHEEL +4 -0
  187. remdb-0.3.0.dist-info/entry_points.txt +2 -0
rem/services/content/providers.py
@@ -0,0 +1,806 @@
+"""Content provider plugins for different file types."""
+
+import json
+import multiprocessing
+import os
+import random
+import subprocess
+import sys
+import tempfile
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Optional
+
+from loguru import logger
+
+
+class ContentProvider(ABC):
+    """Base class for content extraction providers."""
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Provider name for logging/debugging."""
+        pass
+
+    @abstractmethod
+    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract text content from file bytes.
+
+        Args:
+            content: Raw file bytes
+            metadata: File metadata (size, type, etc.)
+
+        Returns:
+            dict with:
+                - text: Extracted text content
+                - metadata: Additional metadata from extraction (optional)
+        """
+        pass
+
+
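For orientation, this base class is the full surface a plugin has to implement. A minimal custom provider against the ContentProvider class above might look like the following sketch (CsvProvider is a hypothetical example, not part of this package):

class CsvProvider(ContentProvider):
    """Hypothetical provider: treat CSV files as plain text with a row count."""

    @property
    def name(self) -> str:
        return "csv"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        text = content.decode("utf-8", errors="replace")
        return {
            "text": text,
            "metadata": {"row_count": len(text.splitlines())},
        }
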
+class TextProvider(ContentProvider):
+    """
+    Text content provider for plain text formats.
+
+    Supports:
+    - Markdown (.md, .markdown) - With heading detection
+    - JSON (.json) - Pretty-printed text extraction
+    - YAML (.yaml, .yml) - Text extraction
+    - Plain text (.txt) - Direct UTF-8 extraction
+    - Code files (.py, .js, .ts, etc.) - Source code as text
+
+    Simple UTF-8 text extraction with basic metadata.
+    Future: Could add frontmatter parsing, JSON schema validation, etc.
+    """
+
+    @property
+    def name(self) -> str:
+        return "text"
+
+    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract text content from plain text files.
+
+        Args:
+            content: Text file bytes
+            metadata: File metadata
+
+        Returns:
+            dict with text and optional metadata (line count, headings for markdown, etc.)
+        """
+        # Decode UTF-8 (with fallback to latin-1)
+        try:
+            text = content.decode("utf-8")
+        except UnicodeDecodeError:
+            logger.debug("UTF-8 decode failed, falling back to latin-1")
+            text = content.decode("latin-1")
+
+        # Basic text analysis
+        lines = text.split("\n")
+
+        # Detect headings (for markdown files)
+        headings = [line for line in lines if line.strip().startswith("#")]
+
+        extraction_metadata = {
+            "line_count": len(lines),
+            "heading_count": len(headings) if headings else None,
+            "char_count": len(text),
+            "encoding": "utf-8",
+        }
+
+        return {
+            "text": text,
+            "metadata": extraction_metadata,
+        }
+
+
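As a quick illustration of the extract() contract above (hypothetical caller code, not part of the diff):

provider = TextProvider()
result = provider.extract(b"# Title\n\nSome body text\n", {"content_type": "text/markdown"})
assert result["metadata"]["line_count"] == 4      # split("\n") counts the trailing empty line
assert result["metadata"]["heading_count"] == 1   # the single "# Title" heading
assert result["metadata"]["char_count"] == 24
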
+class DocProvider(ContentProvider):
+    """
+    Document content provider using Kreuzberg.
+
+    Supports multiple document formats via Kreuzberg:
+    - PDF (.pdf) - Text extraction with OCR fallback
+    - Word (.docx) - Native format support
+    - PowerPoint (.pptx) - Slide content extraction
+    - Excel (.xlsx) - Spreadsheet data extraction
+    - Images (.png, .jpg) - OCR text extraction
+
+    Handles:
+    - Text extraction with OCR fallback
+    - Table detection and extraction
+    - Daemon process workaround for multiprocessing restrictions
+    """
+
+    @property
+    def name(self) -> str:
+        return "doc"
+
+    def _is_daemon_process(self) -> bool:
+        """Check if running in a daemon process."""
+        try:
+            return multiprocessing.current_process().daemon
+        except Exception:
+            return False
+
+    def _parse_in_subprocess(self, file_path: Path) -> dict:
+        """Run kreuzberg in a separate subprocess to bypass daemon restrictions."""
+        script = """
+import json
+import sys
+from pathlib import Path
+from kreuzberg import ExtractionConfig, extract_file_sync
+
+# Parse document with table extraction
+config = ExtractionConfig(
+    extract_tables=True,
+    chunk_content=False,
+    extract_keywords=False,
+)
+
+result = extract_file_sync(Path(sys.argv[1]), config=config)
+
+# Serialize result to JSON
+output = {
+    'content': result.content,
+    'tables': [
+        {
+            'page_number': t.get('page_number', 0),
+            'text': t.get('text', ''),
+        }
+        for t in result.tables
+    ],
+    'metadata': result.metadata
+}
+print(json.dumps(output))
+"""
+
+        # Run in subprocess
+        result = subprocess.run(
+            [sys.executable, "-c", script, str(file_path)],
+            capture_output=True,
+            text=True,
+            timeout=300,  # 5 minute timeout
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"Subprocess parsing failed: {result.stderr}")
+
+        return json.loads(result.stdout)
+
+    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract document content using Kreuzberg.
+
+        Args:
+            content: Document file bytes
+            metadata: File metadata (should include content_type or extension)
+
+        Returns:
+            dict with text and extraction metadata
+        """
+        # Write bytes to temp file for kreuzberg
+        # Detect extension from metadata
+        content_type = metadata.get("content_type", "")
+        extension_map = {
+            "application/pdf": ".pdf",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+            "image/png": ".png",
+            "image/jpeg": ".jpg",
+        }
+        suffix = extension_map.get(content_type, ".pdf")  # Default to PDF
+
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+            tmp.write(content)
+            tmp_path = Path(tmp.name)
+
+        try:
+            # Check if running in daemon process
+            if self._is_daemon_process():
+                logger.info("Daemon process detected - using subprocess workaround for document parsing")
+                try:
+                    result_dict = self._parse_in_subprocess(tmp_path)
+                    text = result_dict["content"]
+                    extraction_metadata = {
+                        "table_count": len(result_dict["tables"]),
+                        "parser": "kreuzberg_subprocess",
+                        "file_extension": tmp_path.suffix,
+                    }
+                except Exception as e:
+                    logger.error(f"Subprocess parsing failed: {e}. Falling back to text-only.")
+                    # Fallback to simple text extraction
+                    from kreuzberg import ExtractionConfig, extract_file_sync
+                    config = ExtractionConfig(extract_tables=False)
+                    result = extract_file_sync(tmp_path, config=config)
+                    text = result.content
+                    extraction_metadata = {
+                        "parser": "kreuzberg_fallback",
+                        "file_extension": tmp_path.suffix,
+                    }
+            else:
+                # Normal execution (not in daemon)
+                from kreuzberg import ExtractionConfig, extract_file_sync
+                config = ExtractionConfig(
+                    extract_tables=True,
+                    chunk_content=False,
+                    extract_keywords=False,
+                )
+                result = extract_file_sync(tmp_path, config=config)
+                text = result.content
+                extraction_metadata = {
+                    "table_count": len(result.tables),
+                    "parser": "kreuzberg",
+                    "file_extension": tmp_path.suffix,
+                }
+
+            return {
+                "text": text,
+                "metadata": extraction_metadata,
+            }
+
+        finally:
+            # Clean up temp file
+            tmp_path.unlink(missing_ok=True)
+
+
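A usage sketch for DocProvider (illustrative; report.pdf is a made-up input, and real callers pass bytes fetched from the storage layer):

from pathlib import Path

pdf_bytes = Path("report.pdf").read_bytes()
result = DocProvider().extract(pdf_bytes, {"content_type": "application/pdf"})
# result["metadata"]["parser"] will be "kreuzberg", "kreuzberg_subprocess", or
# "kreuzberg_fallback", depending on whether the call runs inside a daemon worker
# and whether the subprocess workaround succeeds.
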
+class AudioProvider(ContentProvider):
+    """
+    Audio content provider using AudioChunker + OpenAI Whisper.
+
+    Handles:
+    - Audio chunking by silence near minute boundaries
+    - Transcription via OpenAI Whisper API
+    - Converts chunks to markdown format
+    - Supports WAV, M4A, MP3, FLAC, OGG (via pydub + ffmpeg)
+
+    Process:
+    1. Write audio bytes to temp file
+    2. Chunk audio by silence (AudioChunker)
+    3. Transcribe chunks (AudioTranscriber)
+    4. Combine into markdown format with timestamps
+    5. Clean up temp files
+
+    Returns markdown-formatted transcription that integrates
+    seamlessly with ContentService's markdown → chunk → embed pipeline.
+    """
+
+    @property
+    def name(self) -> str:
+        return "audio"
+
+    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract audio content via transcription.
+
+        Args:
+            content: Audio file bytes
+            metadata: File metadata (size, type, etc.)
+
+        Returns:
+            dict with:
+                - text: Markdown-formatted transcription with timestamps
+                - metadata: Extraction metadata (chunk_count, duration, cost)
+
+        Raises:
+            RuntimeError: If transcription fails or pydub not available
+            ValueError: If OpenAI API key missing
+        """
+        # Handle empty or invalid content
+        if not content or len(content) < 44:  # WAV header is minimum 44 bytes
+            logger.warning("Audio content too small to be valid WAV file")
+            return {
+                "text": "[Invalid or empty audio file]",
+                "metadata": {"error": "invalid_content", "size": len(content)},
+            }
+
+        # Check for OpenAI API key
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            logger.warning("No OPENAI_API_KEY found - audio transcription disabled")
+            return {
+                "text": "[Audio transcription requires OPENAI_API_KEY environment variable]",
+                "metadata": {"error": "missing_api_key"},
+            }
+
+        # Import audio services (lazy import)
+        try:
+            from rem.services.audio import AudioChunker, AudioTranscriber
+        except ImportError as e:
+            logger.error(f"Audio services not available: {e}")
+            return {
+                "text": "[Audio processing requires: pip install rem[audio]]",
+                "metadata": {"error": "missing_dependencies"},
+            }
+
+        # Write bytes to temp file
+        # Detect extension from metadata or use .wav as fallback
+        content_type = metadata.get("content_type", "audio/wav")
+        extension_map = {
+            "audio/wav": ".wav",
+            "audio/mpeg": ".mp3",
+            "audio/mp4": ".m4a",
+            "audio/x-m4a": ".m4a",
+            "audio/flac": ".flac",
+            "audio/ogg": ".ogg",
+        }
+        extension = extension_map.get(content_type, ".wav")
+
+        with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
+            tmp.write(content)
+            tmp_path = Path(tmp.name)
+
+        try:
+            logger.info(f"Processing audio file: {tmp_path.name} ({len(content) / 1024 / 1024:.1f} MB)")
+
+            # Step 1: Chunk audio by silence
+            chunker = AudioChunker(
+                target_chunk_seconds=60.0,
+                chunk_window_seconds=2.0,
+                silence_threshold_db=-40.0,
+                min_silence_ms=500,
+            )
+
+            chunks = chunker.chunk_audio(tmp_path)
+            logger.info(f"Created {len(chunks)} audio chunks")
+
+            # Step 2: Transcribe chunks
+            transcriber = AudioTranscriber(api_key=api_key)
+            results = transcriber.transcribe_chunks(chunks)
+            logger.info(f"Transcribed {len(results)} chunks")
+
+            # Step 3: Combine into markdown format
+            # Format: Each chunk becomes a section with timestamp
+            markdown_parts = []
+            for result in results:
+                timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
+                markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")
+
+            markdown_text = "\n".join(markdown_parts)
+
+            # Calculate metadata
+            total_duration = sum(r.duration_seconds for r in results)
+            estimated_cost = (total_duration / 60) * 0.006  # $0.006 per minute
+            successful_chunks = sum(1 for r in results if r.confidence > 0)
+
+            extraction_metadata = {
+                "chunk_count": len(chunks),
+                "transcribed_chunks": successful_chunks,
+                "duration_seconds": total_duration,
+                "estimated_cost": estimated_cost,
+                "parser": "whisper_api",
+            }
+
+            logger.info(
+                f"Transcription complete: {successful_chunks}/{len(chunks)} chunks, "
+                f"${estimated_cost:.3f} cost"
+            )
+
+            return {
+                "text": markdown_text,
+                "metadata": extraction_metadata,
+            }
+
+        except Exception as e:
+            logger.error(f"Audio extraction failed: {e}")
+            raise RuntimeError(f"Audio transcription failed: {e}") from e
+
+        finally:
+            # Clean up temp file and chunks
+            try:
+                tmp_path.unlink(missing_ok=True)
+                if 'chunker' in locals() and 'chunks' in locals():
+                    chunker.cleanup_chunks(chunks)
+            except Exception as e:
+                logger.warning(f"Cleanup failed: {e}")
+
+
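The cost figure in the metadata is plain arithmetic on the transcribed duration, using the $0.006-per-minute Whisper rate hard-coded above; for example:

total_duration = 20 * 60                         # a 20-minute recording, in seconds
estimated_cost = (total_duration / 60) * 0.006   # -> 0.12, i.e. about $0.12
# The returned "text" is markdown with one "## [start - end]" section per ~60 s chunk,
# so downstream chunking and embedding can treat it like any other markdown document.
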
+class SchemaProvider(ContentProvider):
+    """
+    Schema content provider for agent/evaluator schemas.
+
+    Detects and processes YAML/JSON files containing:
+    - Agent schemas (type: object with json_schema_extra.kind: agent and json_schema_extra.name: <name>)
+    - Evaluator schemas (type: object with json_schema_extra.kind: evaluator and json_schema_extra.name: <name>)
+
+    Stores schemas in the schemas table with deterministic IDs for upsert by name.
+
+    Pattern:
+    - Checks for schema markers (type: object + kind + name)
+    - Generates deterministic ID for upsert (tenant+schema_name)
+    - Stores full schema JSON in schemas table
+    - Extracts metadata (version, tags, provider_configs, embedding_fields)
+    """
+
+    @property
+    def name(self) -> str:
+        return "schema"
+
+    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract and validate agent/evaluator schema.
+
+        Args:
+            content: YAML or JSON file bytes
+            metadata: File metadata
+
+        Returns:
+            dict with:
+                - text: Human-readable schema summary
+                - metadata: Schema metadata
+                - schema_data: Full schema dict for storage
+                - is_schema: True if valid schema detected
+
+        Raises:
+            ValueError: If schema is invalid
+        """
+        import json
+        import yaml
+        from uuid import uuid5, NAMESPACE_DNS
+
+        # Decode content
+        try:
+            text = content.decode("utf-8")
+        except UnicodeDecodeError:
+            text = content.decode("latin-1")
+
+        # Try to parse as YAML/JSON
+        if metadata.get("content_type") == "application/json":
+            schema_data = json.loads(text)  # Raises JSONDecodeError on invalid JSON
+        else:
+            # Try YAML first (supports both YAML and JSON)
+            schema_data = yaml.safe_load(text)  # Raises yaml.YAMLError on invalid YAML
+
+        # Check if it's a schema (type: object + json_schema_extra.kind + json_schema_extra.name)
+        if not isinstance(schema_data, dict):
+            return {
+                "text": text,
+                "metadata": {"parser": "schema_fallback"},
+                "is_schema": False,
+            }
+
+        # Check for schema markers
+        is_object_type = schema_data.get("type") == "object"
+        json_schema_extra = schema_data.get("json_schema_extra", {})
+        kind = json_schema_extra.get("kind", "")
+        schema_name = json_schema_extra.get("name", "")
+
+        # Must have type: object, kind (agent or evaluator), and name
+        is_agent_schema = is_object_type and kind == "agent" and schema_name
+        is_evaluator_schema = is_object_type and kind == "evaluator" and schema_name
+
+        if not (is_agent_schema or is_evaluator_schema):
+            return {
+                "text": text,
+                "metadata": {"parser": "schema_fallback"},
+                "is_schema": False,
+            }
+
+        # Extract schema metadata
+        schema_type = kind  # "agent" or "evaluator"
+        version = json_schema_extra.get("version", "1.0.0")
+        tags = json_schema_extra.get("tags", [])
+
+        # Use name directly (already in kebab-case format)
+        short_name = schema_name
+
+        # Build human-readable summary
+        description = schema_data.get("description", "No description provided")
+        description_preview = description[:200] + "..." if len(description) > 200 else description
+
+        properties = schema_data.get("properties", {})
+        required_fields = schema_data.get("required", [])
+
+        summary_parts = [
+            f"# {schema_type.title()} Schema: {short_name}",
+            f"**Version:** {version}",
+            f"**Name:** {schema_name}",
+            f"**Kind:** {kind}",
+            "",
+            "## Description",
+            description_preview,
+            "",
+            "## Output Fields",
+        ]
+
+        for field_name, field_spec in list(properties.items())[:10]:  # Limit to 10 fields
+            field_type = field_spec.get("type", "unknown")
+            field_desc = field_spec.get("description", "")
+            required = " (required)" if field_name in required_fields else ""
+            summary_parts.append(f"- **{field_name}**: {field_type}{required} - {field_desc[:50]}")
+
+        if len(properties) > 10:
+            summary_parts.append(f"- ... and {len(properties) - 10} more fields")
+
+        text_summary = "\n".join(summary_parts)
+
+        # Extract additional metadata
+        extraction_metadata = {
+            "parser": "schema",
+            "schema_type": schema_type,
+            "short_name": short_name,
+            "version": version,
+            "kind": kind,
+            "name": schema_name,
+            "tags": tags,
+            "field_count": len(properties),
+            "required_field_count": len(required_fields),
+            "provider_configs": json_schema_extra.get("provider_configs", []),
+            "embedding_fields": json_schema_extra.get("embedding_fields", []),
+            "category": json_schema_extra.get("category"),
+        }
+
+        return {
+            "text": text_summary,
+            "metadata": extraction_metadata,
+            "schema_data": schema_data,
+            "is_schema": True,
+        }
+
+
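To make the detection rule concrete, here is a minimal YAML payload that the provider above would accept as an agent schema (the field values are invented for illustration):

minimal_agent_yaml = b"""
type: object
description: Example agent that returns a greeting.
properties:
  greeting:
    type: string
    description: The greeting to return.
required:
  - greeting
json_schema_extra:
  kind: agent
  name: hello-world
  version: 1.0.0
"""

result = SchemaProvider().extract(minimal_agent_yaml, {"content_type": "application/yaml"})
# result["is_schema"] -> True, result["metadata"]["schema_type"] -> "agent",
# result["text"] is the generated markdown summary, result["schema_data"] the parsed dict.
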
+class ImageProvider(ContentProvider):
+    """
+    Image content provider with vision LLM analysis and CLIP embeddings.
+
+    Features:
+    - Tier-based vision analysis (gold tier always gets analysis)
+    - Sampling-based vision analysis for non-gold users
+    - Vision LLM description generation (Anthropic, Gemini, OpenAI)
+    - Future: CLIP embeddings for semantic image search
+
+    Process:
+    1. Check user tier and sampling rate
+    2. If eligible, run vision LLM analysis
+    3. Extract image metadata (dimensions, format)
+    4. Return markdown description or basic metadata
+    5. Save to ImageResource table (not Resource)
+
+    Vision analysis is expensive, so it's gated by:
+    - User tier (gold = always, silver/free = sampled)
+    - Sample rate setting (0.0 = never, 1.0 = always)
+    """
+
+    def __init__(self, user_tier: Optional[str] = None):
+        """
+        Initialize image provider.
+
+        Args:
+            user_tier: User tier (free, silver, gold) for vision gating
+        """
+        self.user_tier = user_tier
+
+    @property
+    def name(self) -> str:
+        return "image"
+
+    def _should_analyze_with_vision(self, sample_rate: float) -> bool:
+        """
+        Determine if image should get vision LLM analysis.
+
+        Args:
+            sample_rate: Sampling rate from settings (0.0-1.0)
+
+        Returns:
+            True if should analyze, False otherwise
+        """
+        # Import here to avoid circular dependency
+        from rem.models.entities import UserTier
+
+        # Gold tier always gets vision analysis
+        if self.user_tier == UserTier.GOLD.value:
+            logger.info("Gold tier user - vision analysis enabled")
+            return True
+
+        # For non-gold users, use sampling
+        if sample_rate > 0.0:
+            should_analyze = random.random() < sample_rate
+            if should_analyze:
+                logger.info(f"Vision analysis sampled (rate={sample_rate})")
+            return should_analyze
+
+        return False
+
+    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract image content with optional vision LLM analysis.
+
+        Args:
+            content: Image file bytes
+            metadata: File metadata (size, type, etc.)
+
+        Returns:
+            dict with:
+                - text: Markdown description (if vision enabled) or basic metadata
+                - metadata: Extraction metadata (dimensions, format, vision info)
+                - image_specific: Additional image metadata for ImageResource
+
+        Raises:
+            RuntimeError: If vision analysis fails
+        """
+        # Import settings here to avoid circular dependency
+        from rem.settings import settings
+
+        # Extract basic image metadata using PIL
+        try:
+            from PIL import Image
+            import io
+
+            img = Image.open(io.BytesIO(content))
+            image_width = img.width
+            image_height = img.height
+            image_format = img.format or "UNKNOWN"
+        except ImportError:
+            logger.warning("PIL not available - image metadata extraction disabled")
+            image_width = None
+            image_height = None
+            image_format = None
+        except Exception as e:
+            logger.warning(f"Failed to extract image metadata: {e}")
+            image_width = None
+            image_height = None
+            image_format = None
+
+        # Check if vision analysis should be performed
+        sample_rate = settings.content.image_vllm_sample_rate
+        should_analyze = self._should_analyze_with_vision(sample_rate)
+
+        vision_description = None
+        vision_provider = None
+        vision_model = None
+
+        if should_analyze:
+            # Perform vision LLM analysis
+            try:
+                from rem.utils.vision import ImageAnalyzer, VisionProvider
+
+                # Get provider from settings
+                provider_str = settings.content.image_vllm_provider.lower()
+                provider_map = {
+                    "anthropic": VisionProvider.ANTHROPIC,
+                    "gemini": VisionProvider.GEMINI,
+                    "openai": VisionProvider.OPENAI,
+                }
+                provider = provider_map.get(provider_str, VisionProvider.ANTHROPIC)
+
+                # Create analyzer
+                analyzer = ImageAnalyzer(
+                    provider=provider,
+                    model=settings.content.image_vllm_model,
+                )
+
+                # Write bytes to temp file for analysis
+                content_type = metadata.get("content_type", "image/png")
+                extension_map = {
+                    "image/png": ".png",
+                    "image/jpeg": ".jpg",
+                    "image/gif": ".gif",
+                    "image/webp": ".webp",
+                }
+                extension = extension_map.get(content_type, ".png")
+
+                with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
+                    tmp.write(content)
+                    tmp_path = Path(tmp.name)
+
+                try:
+                    # Analyze image
+                    result = analyzer.analyze_image(tmp_path)
+                    vision_description = result.description
+                    vision_provider = result.provider.value
+                    vision_model = result.model
+
+                    logger.info(f"Vision analysis complete: {len(vision_description)} chars")
+                finally:
+                    # Clean up temp file
+                    tmp_path.unlink(missing_ok=True)
+
+            except ImportError as e:
+                logger.warning(f"Vision analysis not available: {e}")
+            except Exception as e:
+                logger.error(f"Vision analysis failed: {e}")
+
+        # Build text content
+        if vision_description:
+            # Use vision description as primary content
+            text = f"# Image Analysis\n\n{vision_description}"
+            if image_width and image_height:
+                text += f"\n\n**Image Details:** {image_width}x{image_height} {image_format}"
+        else:
+            # Fallback to basic metadata
+            if image_width and image_height:
+                text = f"**Image:** {image_width}x{image_height} {image_format}"
+            else:
+                text = "**Image:** Metadata extraction unavailable"
+
+        # Generate CLIP embedding (if Jina API key available)
+        clip_embedding = None
+        clip_dimensions = None
+        clip_tokens = None
+
+        try:
+            from rem.utils.clip_embeddings import JinaCLIPEmbedder
+
+            # Only attempt CLIP embeddings if using Jina provider
+            if settings.content.clip_provider != "jina":
+                logger.debug(
+                    f"CLIP provider set to '{settings.content.clip_provider}' - "
+                    "skipping Jina embeddings (self-hosted not yet implemented)"
+                )
+            else:
+                embedder = JinaCLIPEmbedder(
+                    api_key=settings.content.jina_api_key,
+                    model=settings.content.clip_model,
+                )
+
+                if embedder.is_available():
+                    # Write bytes to temp file for CLIP embedding
+                    content_type = metadata.get("content_type", "image/png")
+                    extension_map = {
+                        "image/png": ".png",
+                        "image/jpeg": ".jpg",
+                        "image/gif": ".gif",
+                        "image/webp": ".webp",
+                    }
+                    extension = extension_map.get(content_type, ".png")
+
+                    with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
+                        tmp.write(content)
+                        tmp_path = Path(tmp.name)
+
+                    try:
+                        # Generate CLIP embedding
+                        result = embedder.embed_image(tmp_path)
+                        if result:
+                            clip_embedding = result.embedding  # type: ignore[attr-defined]
+                            clip_dimensions = result.dimensions  # type: ignore[attr-defined]
+                            clip_tokens = result.tokens_used  # type: ignore[attr-defined]
+                            logger.info(
+                                f"CLIP embedding generated: {clip_dimensions} dims, {clip_tokens} tokens"
+                            )
+                    finally:
+                        # Clean up temp file
+                        tmp_path.unlink(missing_ok=True)
+                else:
+                    logger.debug(
+                        "CLIP embeddings disabled - set CONTENT__JINA_API_KEY to enable. "
+                        "Get free API key at https://jina.ai/embeddings/"
+                    )
+
+        except ImportError:
+            logger.debug("CLIP embedding module not available")
+        except Exception as e:
+            logger.warning(f"CLIP embedding generation failed (non-fatal): {e}")
+
+        # Build extraction metadata
+        extraction_metadata = {
+            "parser": "image_provider",
+            "vision_enabled": vision_description is not None,
+            "vision_provider": vision_provider,
+            "vision_model": vision_model,
+            "image_width": image_width,
+            "image_height": image_height,
+            "image_format": image_format,
+            "clip_enabled": clip_embedding is not None,
+            "clip_dimensions": clip_dimensions,
+            "clip_tokens": clip_tokens,
+        }
+
+        # Add image-specific metadata for ImageResource
+        image_specific = {
+            "image_width": image_width,
+            "image_height": image_height,
+            "image_format": image_format,
+            "vision_description": vision_description,
+            "vision_provider": vision_provider,
+            "vision_model": vision_model,
+            "clip_embedding": clip_embedding,
+            "clip_dimensions": clip_dimensions,
+        }
+
+        return {
+            "text": text,
+            "metadata": extraction_metadata,
+            "image_specific": image_specific,
+        }
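
Taken together, the five providers cover text, documents, audio, schemas, and images; the selection between them presumably lives in ContentService (rem/services/content/service.py, also added in this release). The sketch below only illustrates that kind of content-type routing, with a made-up mapping rather than the package's actual dispatch:

def pick_provider(content_type: str, user_tier: Optional[str] = None) -> ContentProvider:
    """Illustrative routing only; not the dispatch used by this package."""
    if content_type.startswith("image/"):
        return ImageProvider(user_tier=user_tier)
    if content_type.startswith("audio/"):
        return AudioProvider()
    if content_type in ("application/yaml", "application/x-yaml", "application/json"):
        return SchemaProvider()
    if content_type.startswith("text/"):
        return TextProvider()
    return DocProvider()  # PDFs, Office formats, and anything that may need OCR

result = pick_provider("text/markdown").extract(b"# Notes\n", {"content_type": "text/markdown"})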