remdb-0.3.242-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of remdb might be problematic.

Files changed (235)
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/services/content/providers.py
@@ -0,0 +1,760 @@
"""Content provider plugins for different file types."""

import json
import multiprocessing
import random
import subprocess
import sys
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Optional

from loguru import logger

from rem.utils.constants import (
    AUDIO_CHUNK_TARGET_SECONDS,
    AUDIO_CHUNK_WINDOW_SECONDS,
    MIN_SILENCE_MS,
    SILENCE_THRESHOLD_DB,
    SUBPROCESS_TIMEOUT_SECONDS,
    WAV_HEADER_MIN_BYTES,
    WHISPER_COST_PER_MINUTE,
)
from rem.utils.files import temp_file_from_bytes
from rem.utils.mime_types import get_extension
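
# Assumed contract of temp_file_from_bytes (a sketch for orientation only; the
# real helper lives in rem/utils/files.py and is not shown in this diff): a
# context manager that writes bytes to a named temp file and deletes it on exit.
#
#     from contextlib import contextmanager
#     import os, tempfile
#
#     @contextmanager
#     def temp_file_from_bytes(content: bytes, suffix: str = ""):
#         fd, name = tempfile.mkstemp(suffix=suffix)
#         try:
#             with os.fdopen(fd, "wb") as f:
#                 f.write(content)
#             yield Path(name)
#         finally:
#             os.unlink(name)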


class ContentProvider(ABC):
    """Base class for content extraction providers."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Provider name for logging/debugging."""
        pass

    @abstractmethod
    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract text content from file bytes.

        Args:
            content: Raw file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Extracted text content
            - metadata: Additional metadata from extraction (optional)
        """
        pass

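
# Example (hypothetical, not shipped in remdb): a minimal provider following the
# ContentProvider contract above. Registration with ContentService is out of
# scope here; this only demonstrates the extract() return shape.
class CsvPreviewProvider(ContentProvider):
    """Illustrative provider that surfaces the first rows of a CSV file."""

    @property
    def name(self) -> str:
        return "csv-preview"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        text = content.decode("utf-8", errors="replace")
        rows = text.splitlines()
        return {
            "text": "\n".join(rows[:20]),  # first 20 rows as a preview
            "metadata": {"row_count": len(rows), "parser": "csv-preview"},
        }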


class TextProvider(ContentProvider):
    """
    Text content provider for plain text formats.

    Supports:
    - Markdown (.md, .markdown) - With heading detection
    - JSON (.json) - Pretty-printed text extraction
    - YAML (.yaml, .yml) - Text extraction
    - Plain text (.txt) - Direct UTF-8 extraction
    - Code files (.py, .js, .ts, etc.) - Source code as text

    Simple UTF-8 text extraction with basic metadata.
    Future: Could add frontmatter parsing, JSON schema validation, etc.
    """

    @property
    def name(self) -> str:
        return "text"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract text content from plain text files.

        Args:
            content: Text file bytes
            metadata: File metadata

        Returns:
            dict with text and optional metadata (line count, headings for markdown, etc.)
        """
        # Decode UTF-8 (with fallback to latin-1)
        encoding = "utf-8"
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            logger.debug("UTF-8 decode failed, falling back to latin-1")
            text = content.decode("latin-1")
            encoding = "latin-1"

        # Basic text analysis
        lines = text.split("\n")

        # Detect headings (for markdown files)
        headings = [line for line in lines if line.strip().startswith("#")]

        extraction_metadata = {
            "line_count": len(lines),
            "heading_count": len(headings) if headings else None,
            "char_count": len(text),
            "encoding": encoding,  # reflects the decoder actually used
        }

        return {
            "text": text,
            "metadata": extraction_metadata,
        }

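
# Usage sketch (assumed calling convention; mirrors the extract() signature above):
#
#     provider = TextProvider()
#     result = provider.extract(b"# Title\n\nBody text\n", {"content_type": "text/markdown"})
#     assert result["metadata"]["line_count"] == 4
#     assert result["metadata"]["heading_count"] == 1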


class DocProvider(ContentProvider):
    """
    Document content provider using Kreuzberg.

    Supports multiple document formats via Kreuzberg:
    - PDF (.pdf) - Text extraction with OCR fallback
    - Word (.docx) - Native format support
    - PowerPoint (.pptx) - Slide content extraction
    - Excel (.xlsx) - Spreadsheet data extraction
    - Images (.png, .jpg) - OCR text extraction

    Handles:
    - Text extraction with OCR fallback
    - Table detection and extraction
    - Daemon process workaround for multiprocessing restrictions
    """

    @property
    def name(self) -> str:
        return "doc"

    def _is_daemon_process(self) -> bool:
        """Check if running in a daemon process."""
        try:
            return multiprocessing.current_process().daemon
        except Exception:
            return False

    def _parse_in_subprocess(self, file_path: Path) -> dict:
        """Run kreuzberg in a separate subprocess to bypass daemon restrictions."""
        script = """
import json
import sys
from pathlib import Path
from kreuzberg import ExtractionConfig, extract_file_sync

# Parse document with kreuzberg 3.x
config = ExtractionConfig(
    extract_tables=True,
    chunk_content=False,
    extract_keywords=False,
)

result = extract_file_sync(Path(sys.argv[1]), config=config)

# Serialize result to JSON
output = {
    'content': result.content,
    'tables': [t.model_dump() for t in result.tables] if result.tables else [],
    'metadata': result.metadata,
}
print(json.dumps(output))
"""

        # Run in subprocess
        result = subprocess.run(
            [sys.executable, "-c", script, str(file_path)],
            capture_output=True,
            text=True,
            timeout=SUBPROCESS_TIMEOUT_SECONDS,
        )

        if result.returncode != 0:
            raise RuntimeError(f"Subprocess parsing failed: {result.stderr}")

        return json.loads(result.stdout)

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract document content using Kreuzberg.

        Args:
            content: Document file bytes
            metadata: File metadata (should include content_type or extension)

        Returns:
            dict with text and extraction metadata
        """
        # Write bytes to temp file for kreuzberg.
        # Detect extension from metadata.
        content_type = metadata.get("content_type", "")
        suffix = get_extension(content_type, default=".pdf")

        with temp_file_from_bytes(content, suffix=suffix) as tmp_path:
            # Check if running in daemon process
            if self._is_daemon_process():
                logger.info("Daemon process detected - using subprocess workaround for document parsing")
                try:
                    result_dict = self._parse_in_subprocess(tmp_path)
                    text = result_dict["content"]
                    extraction_metadata = {
                        "table_count": len(result_dict["tables"]),
                        "parser": "kreuzberg_subprocess",
                        "file_extension": tmp_path.suffix,
                    }
                except Exception as e:
                    logger.error(f"Subprocess parsing failed: {e}. Falling back to text-only.")
                    # Fallback to simple text extraction (kreuzberg 3.x API)
                    from kreuzberg import ExtractionConfig, extract_file_sync

                    config = ExtractionConfig(extract_tables=False)
                    result = extract_file_sync(tmp_path, config=config)
                    text = result.content
                    extraction_metadata = {
                        "parser": "kreuzberg_fallback",
                        "file_extension": tmp_path.suffix,
                    }
            else:
                # Normal execution (not in daemon) - kreuzberg 4.x with native ONNX/Rust
                from kreuzberg import ExtractionConfig, extract_file_sync

                config = ExtractionConfig(
                    enable_quality_processing=True,  # Higher-quality extraction with native ONNX
                    chunk_content=False,  # We handle chunking ourselves
                    extract_tables=False,  # Disable table extraction to avoid PyTorch dependency
                )
                result = extract_file_sync(tmp_path, config=config)
                text = result.content
                extraction_metadata = {
                    "table_count": len(result.tables) if result.tables else 0,
                    "parser": "kreuzberg",
                    "file_extension": tmp_path.suffix,
                }

        return {
            "text": text,
            "metadata": extraction_metadata,
        }

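
# The daemon workaround above exists because daemonic multiprocessing workers
# may not spawn children via multiprocessing themselves; shelling out with
# subprocess sidesteps that restriction. A standalone sketch of the same
# pattern (hypothetical helper, not part of remdb):
#
#     import subprocess, sys
#
#     def run_isolated(snippet: str, *args: str, timeout: float = 60.0) -> str:
#         """Run a Python snippet in a fresh interpreter and return its stdout."""
#         proc = subprocess.run(
#             [sys.executable, "-c", snippet, *args],
#             capture_output=True, text=True, timeout=timeout,
#         )
#         if proc.returncode != 0:
#             raise RuntimeError(proc.stderr)
#         return proc.stdout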


class AudioProvider(ContentProvider):
    """
    Audio content provider using AudioChunker + OpenAI Whisper.

    Handles:
    - Audio chunking by silence near minute boundaries
    - Transcription via OpenAI Whisper API
    - Converts chunks to markdown format
    - Supports WAV, M4A, MP3, FLAC, OGG (via pydub + ffmpeg)

    Process:
    1. Write audio bytes to temp file
    2. Chunk audio by silence (AudioChunker)
    3. Transcribe chunks (AudioTranscriber)
    4. Combine into markdown format with timestamps
    5. Clean up temp files

    Returns markdown-formatted transcription that integrates
    seamlessly with ContentService's markdown → chunk → embed pipeline.
    """

    @property
    def name(self) -> str:
        return "audio"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract audio content via transcription.

        Args:
            content: Audio file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Markdown-formatted transcription with timestamps
            - metadata: Extraction metadata (chunk_count, duration, cost)

        Raises:
            RuntimeError: If transcription fails or pydub not available
            ValueError: If OpenAI API key missing
        """
        # Handle empty or invalid content
        if not content or len(content) < WAV_HEADER_MIN_BYTES:
            logger.warning("Audio content too small to be a valid WAV file")
            return {
                "text": "[Invalid or empty audio file]",
                "metadata": {"error": "invalid_content", "size": len(content)},
            }

        # Check for OpenAI API key (use settings)
        from rem.settings import settings

        api_key = settings.llm.openai_api_key
        if not api_key:
            logger.warning("No OpenAI API key found - audio transcription disabled")
            return {
                "text": "[Audio transcription requires LLM__OPENAI_API_KEY to be set]",
                "metadata": {"error": "missing_api_key"},
            }

        # Import audio services (lazy import)
        try:
            from rem.services.audio import AudioChunker, AudioTranscriber
        except ImportError as e:
            logger.error(f"Audio services not available: {e}")
            return {
                "text": "[Audio processing requires: pip install rem[audio]]",
                "metadata": {"error": "missing_dependencies"},
            }

        # Write bytes to temp file.
        # Detect extension from metadata or use .wav as fallback.
        content_type = metadata.get("content_type", "audio/wav")
        extension = get_extension(content_type, default=".wav")

        chunker = None
        chunks = None

        with temp_file_from_bytes(content, suffix=extension) as tmp_path:
            try:
                logger.info(f"Processing audio file: {tmp_path.name} ({len(content) / 1024 / 1024:.1f} MB)")

                # Step 1: Chunk audio by silence
                chunker = AudioChunker(
                    target_chunk_seconds=AUDIO_CHUNK_TARGET_SECONDS,
                    chunk_window_seconds=AUDIO_CHUNK_WINDOW_SECONDS,
                    silence_threshold_db=SILENCE_THRESHOLD_DB,
                    min_silence_ms=MIN_SILENCE_MS,
                )

                chunks = chunker.chunk_audio(tmp_path)
                logger.info(f"Created {len(chunks)} audio chunks")

                # Step 2: Transcribe chunks
                transcriber = AudioTranscriber(api_key=api_key)
                results = transcriber.transcribe_chunks(chunks)
                logger.info(f"Transcribed {len(results)} chunks")

                # Step 3: Combine into markdown format.
                # Format: each chunk becomes a section with a timestamp heading.
                markdown_parts = []
                for result in results:
                    timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
                    markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")

                markdown_text = "\n".join(markdown_parts)

                # Calculate metadata
                total_duration = sum(r.duration_seconds for r in results)
                estimated_cost = (total_duration / 60) * WHISPER_COST_PER_MINUTE
                successful_chunks = sum(1 for r in results if r.confidence > 0)

                extraction_metadata = {
                    "chunk_count": len(chunks),
                    "transcribed_chunks": successful_chunks,
                    "duration_seconds": total_duration,
                    "estimated_cost": estimated_cost,
                    "parser": "whisper_api",
                }

                logger.info(
                    f"Transcription complete: {successful_chunks}/{len(chunks)} chunks, "
                    f"${estimated_cost:.3f} cost"
                )

                return {
                    "text": markdown_text,
                    "metadata": extraction_metadata,
                }

            except Exception as e:
                logger.error(f"Audio extraction failed: {e}")
                raise RuntimeError(f"Audio transcription failed: {e}") from e

            finally:
                # Clean up audio chunks (temp file cleanup handled by context manager)
                if chunker is not None and chunks is not None:
                    try:
                        chunker.cleanup_chunks(chunks)
                    except Exception as e:
                        logger.warning(f"Chunk cleanup failed: {e}")

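
# Shape of the markdown this provider emits (illustrative values):
#
#     ## [0.0s - 58.3s]
#
#     First transcribed segment...
#
#     ## [58.3s - 121.7s]
#
#     Second transcribed segment...
#
# Cost is estimated as (duration_seconds / 60) * WHISPER_COST_PER_MINUTE; e.g.
# a 10-minute file at $0.006/min would log an estimated cost of $0.060 (the
# actual constant lives in rem/utils/constants.py).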


class SchemaProvider(ContentProvider):
    """
    Schema content provider for agent/evaluator schemas.

    Detects and processes YAML/JSON files containing:
    - Agent schemas (type: object with json_schema_extra.kind: agent and json_schema_extra.name: <name>)
    - Evaluator schemas (type: object with json_schema_extra.kind: evaluator and json_schema_extra.name: <name>)

    Stores schemas in the schemas table with deterministic IDs for upsert by name.

    Pattern:
    - Checks for schema markers (type: object + kind + name)
    - Generates deterministic ID for upsert (tenant+schema_name)
    - Stores full schema JSON in schemas table
    - Extracts metadata (version, tags, provider_configs, embedding_fields)
    """

    @property
    def name(self) -> str:
        return "schema"

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract and validate agent/evaluator schema.

        Args:
            content: YAML or JSON file bytes
            metadata: File metadata

        Returns:
            dict with:
            - text: Human-readable schema summary
            - metadata: Schema metadata
            - schema_data: Full schema dict for storage
            - is_schema: True if valid schema detected

        Raises:
            ValueError: If schema is invalid
        """
        import yaml

        # Decode content
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            text = content.decode("latin-1")

        # Try to parse as YAML/JSON
        if metadata.get("content_type") == "application/json":
            schema_data = json.loads(text)  # Raises JSONDecodeError on invalid JSON
        else:
            # Try YAML first (supports both YAML and JSON)
            schema_data = yaml.safe_load(text)  # Raises yaml.YAMLError on invalid YAML

        # Check if it's a schema (type: object + json_schema_extra.kind + json_schema_extra.name)
        if not isinstance(schema_data, dict):
            return {
                "text": text,
                "metadata": {"parser": "schema_fallback"},
                "is_schema": False,
            }

        # Check for schema markers
        is_object_type = schema_data.get("type") == "object"
        json_schema_extra = schema_data.get("json_schema_extra", {})
        kind = json_schema_extra.get("kind", "")
        schema_name = json_schema_extra.get("name", "")

        # Must have type: object, kind (agent or evaluator), and name
        is_agent_schema = is_object_type and kind == "agent" and schema_name
        is_evaluator_schema = is_object_type and kind == "evaluator" and schema_name

        if not (is_agent_schema or is_evaluator_schema):
            return {
                "text": text,
                "metadata": {"parser": "schema_fallback"},
                "is_schema": False,
            }

        # Extract schema metadata
        schema_type = kind  # "agent" or "evaluator"
        version = json_schema_extra.get("version", "1.0.0")
        tags = json_schema_extra.get("tags", [])

        # Use name directly (already in kebab-case format)
        short_name = schema_name

        # Build human-readable summary
        description = schema_data.get("description", "No description provided")
        description_preview = description[:200] + "..." if len(description) > 200 else description

        properties = schema_data.get("properties", {})
        required_fields = schema_data.get("required", [])

        summary_parts = [
            f"# {schema_type.title()} Schema: {short_name}",
            f"**Version:** {version}",
            f"**Name:** {schema_name}",
            f"**Kind:** {kind}",
            "",
            "## Description",
            description_preview,
            "",
            "## Output Fields",
        ]

        for field_name, field_spec in list(properties.items())[:10]:  # Limit to 10 fields
            field_type = field_spec.get("type", "unknown")
            field_desc = field_spec.get("description", "")
            required = " (required)" if field_name in required_fields else ""
            summary_parts.append(f"- **{field_name}**: {field_type}{required} - {field_desc[:50]}")

        if len(properties) > 10:
            summary_parts.append(f"- ... and {len(properties) - 10} more fields")

        text_summary = "\n".join(summary_parts)

        # Extract additional metadata
        extraction_metadata = {
            "parser": "schema",
            "schema_type": schema_type,
            "short_name": short_name,
            "version": version,
            "kind": kind,
            "name": schema_name,
            "tags": tags,
            "field_count": len(properties),
            "required_field_count": len(required_fields),
            "provider_configs": json_schema_extra.get("provider_configs", []),
            "embedding_fields": json_schema_extra.get("embedding_fields", []),
            "category": json_schema_extra.get("category"),
        }

        return {
            "text": text_summary,
            "metadata": extraction_metadata,
            "schema_data": schema_data,
            "is_schema": True,
        }

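
# A minimal document this provider would classify as an agent schema
# (hypothetical example, not one of the YAML files shipped under rem/schemas/):
#
#     type: object
#     description: Answers questions about a codebase.
#     json_schema_extra:
#       kind: agent
#       name: code-qa
#       version: 1.0.0
#     properties:
#       answer:
#         type: string
#         description: The agent's answer.
#     required: [answer]
#
# SchemaProvider().extract() on those bytes returns is_schema=True with
# metadata: schema_type="agent", name="code-qa", field_count=1.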


class ImageProvider(ContentProvider):
    """
    Image content provider with vision LLM analysis and CLIP embeddings.

    Features:
    - Tier-based vision analysis (gold tier always gets analysis)
    - Sampling-based vision analysis for non-gold users
    - Vision LLM description generation (Anthropic, Gemini, OpenAI)
    - CLIP embeddings (via Jina API) for semantic image search

    Process:
    1. Check user tier and sampling rate
    2. If eligible, run vision LLM analysis
    3. Extract image metadata (dimensions, format)
    4. Return markdown description or basic metadata
    5. Save to ImageResource table (not Resource)

    Vision analysis is expensive, so it is gated by:
    - User tier (gold = always, silver/free = sampled)
    - Sample rate setting (0.0 = never, 1.0 = always)
    """

    def __init__(self, user_tier: Optional[str] = None):
        """
        Initialize image provider.

        Args:
            user_tier: User tier (free, silver, gold) for vision gating
        """
        self.user_tier = user_tier

    @property
    def name(self) -> str:
        return "image"

    def _should_analyze_with_vision(self, sample_rate: float) -> bool:
        """
        Determine if image should get vision LLM analysis.

        Args:
            sample_rate: Sampling rate from settings (0.0-1.0)

        Returns:
            True if should analyze, False otherwise
        """
        # Import here to avoid circular dependency
        from rem.models.entities import UserTier

        # Gold tier always gets vision analysis
        if self.user_tier == UserTier.GOLD.value:
            logger.info("Gold tier user - vision analysis enabled")
            return True

        # For non-gold users, use sampling
        if sample_rate > 0.0:
            should_analyze = random.random() < sample_rate
            if should_analyze:
                logger.info(f"Vision analysis sampled (rate={sample_rate})")
            return should_analyze

        return False

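    # Sampling note: for a non-gold user with sample_rate=0.25, roughly one in
    # four uploads gets a vision pass; the expected number of analyzed images
    # in a batch of N is N * sample_rate. A quick (illustrative) check:
    #
    #     analyzed = sum(
    #         ImageProvider(user_tier="free")._should_analyze_with_vision(0.25)
    #         for _ in range(10_000)
    #     )
    #     # analyzed / 10_000 ≈ 0.25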

    def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
        """
        Extract image content with optional vision LLM analysis.

        Args:
            content: Image file bytes
            metadata: File metadata (size, type, etc.)

        Returns:
            dict with:
            - text: Markdown description (if vision enabled) or basic metadata
            - metadata: Extraction metadata (dimensions, format, vision info)
            - image_specific: Additional image metadata for ImageResource

        Raises:
            RuntimeError: If vision analysis fails
        """
        # Import settings here to avoid circular dependency
        from rem.settings import settings

        # Extract basic image metadata using PIL
        try:
            import io

            from PIL import Image

            img = Image.open(io.BytesIO(content))
            image_width = img.width
            image_height = img.height
            image_format = img.format or "UNKNOWN"
        except ImportError:
            logger.warning("PIL not available - image metadata extraction disabled")
            image_width = None
            image_height = None
            image_format = None
        except Exception as e:
            logger.warning(f"Failed to extract image metadata: {e}")
            image_width = None
            image_height = None
            image_format = None

        # Check if vision analysis should be performed
        sample_rate = settings.content.image_vllm_sample_rate
        should_analyze = self._should_analyze_with_vision(sample_rate)

        vision_description = None
        vision_provider = None
        vision_model = None

        if should_analyze:
            # Perform vision LLM analysis
            try:
                from rem.utils.vision import ImageAnalyzer, VisionProvider

                # Get provider from settings
                provider_str = settings.content.image_vllm_provider.lower()
                provider_map = {
                    "anthropic": VisionProvider.ANTHROPIC,
                    "gemini": VisionProvider.GEMINI,
                    "openai": VisionProvider.OPENAI,
                }
                provider = provider_map.get(provider_str, VisionProvider.ANTHROPIC)

                # Create analyzer
                analyzer = ImageAnalyzer(
                    provider=provider,
                    model=settings.content.image_vllm_model,
                )

                # Write bytes to temp file for analysis
                content_type = metadata.get("content_type", "image/png")
                extension = get_extension(content_type, default=".png")

                with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                    # Analyze image
                    result = analyzer.analyze_image(tmp_path)
                    vision_description = result.description
                    vision_provider = result.provider.value
                    vision_model = result.model

                logger.info(f"Vision analysis complete: {len(vision_description)} chars")

            except ImportError as e:
                logger.warning(f"Vision analysis not available: {e}")
            except Exception as e:
                logger.error(f"Vision analysis failed: {e}")

        # Build text content
        if vision_description:
            # Use vision description as primary content
            text = f"# Image Analysis\n\n{vision_description}"
            if image_width and image_height:
                text += f"\n\n**Image Details:** {image_width}x{image_height} {image_format}"
        else:
            # Fallback to basic metadata
            if image_width and image_height:
                text = f"**Image:** {image_width}x{image_height} {image_format}"
            else:
                text = "**Image:** Metadata extraction unavailable"

        # Generate CLIP embedding (if Jina API key available)
        clip_embedding = None
        clip_dimensions = None
        clip_tokens = None

        try:
            from rem.utils.clip_embeddings import JinaCLIPEmbedder

            # Only attempt CLIP embeddings if using Jina provider
            if settings.content.clip_provider != "jina":
                logger.debug(
                    f"CLIP provider set to '{settings.content.clip_provider}' - "
                    "skipping Jina embeddings (self-hosted not yet implemented)"
                )
            else:
                embedder = JinaCLIPEmbedder(
                    api_key=settings.content.jina_api_key,
                    model=settings.content.clip_model,
                )

                if embedder.is_available():
                    # Write bytes to temp file for CLIP embedding
                    content_type = metadata.get("content_type", "image/png")
                    extension = get_extension(content_type, default=".png")

                    with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                        # Generate CLIP embedding
                        result = embedder.embed_image(tmp_path)
                        if result:
                            clip_embedding = result.embedding  # type: ignore[attr-defined]
                            clip_dimensions = result.dimensions  # type: ignore[attr-defined]
                            clip_tokens = result.tokens_used  # type: ignore[attr-defined]
                            logger.info(
                                f"CLIP embedding generated: {clip_dimensions} dims, {clip_tokens} tokens"
                            )
                else:
                    logger.debug(
                        "CLIP embeddings disabled - set CONTENT__JINA_API_KEY to enable. "
                        "Get free API key at https://jina.ai/embeddings/"
                    )

        except ImportError:
            logger.debug("CLIP embedding module not available")
        except Exception as e:
            logger.warning(f"CLIP embedding generation failed (non-fatal): {e}")

        # Build extraction metadata
        extraction_metadata = {
            "parser": "image_provider",
            "vision_enabled": vision_description is not None,
            "vision_provider": vision_provider,
            "vision_model": vision_model,
            "image_width": image_width,
            "image_height": image_height,
            "image_format": image_format,
            "clip_enabled": clip_embedding is not None,
            "clip_dimensions": clip_dimensions,
            "clip_tokens": clip_tokens,
        }

        # Add image-specific metadata for ImageResource
        image_specific = {
            "image_width": image_width,
            "image_height": image_height,
            "image_format": image_format,
            "vision_description": vision_description,
            "vision_provider": vision_provider,
            "vision_model": vision_model,
            "clip_embedding": clip_embedding,
            "clip_dimensions": clip_dimensions,
        }

        return {
            "text": text,
            "metadata": extraction_metadata,
            "image_specific": image_specific,
        }
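
# End-to-end usage sketch (hypothetical driver code; ContentService in
# rem/services/content/service.py wires these providers up, and the selection
# logic here is assumed, not taken from the package):
#
#     providers: dict[str, ContentProvider] = {
#         "text": TextProvider(),
#         "doc": DocProvider(),
#         "audio": AudioProvider(),
#         "schema": SchemaProvider(),
#         "image": ImageProvider(user_tier="gold"),
#     }
#
#     def extract_any(name: str, payload: bytes, content_type: str) -> dict:
#         provider = providers[name]
#         return provider.extract(payload, {"content_type": content_type})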