remdb 0.2.6__py3-none-any.whl → 0.3.118__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic.
- rem/__init__.py +129 -2
- rem/agentic/README.md +76 -0
- rem/agentic/__init__.py +15 -0
- rem/agentic/agents/__init__.py +16 -2
- rem/agentic/agents/sse_simulator.py +500 -0
- rem/agentic/context.py +28 -22
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/tool_wrapper.py +29 -3
- rem/agentic/otel/setup.py +92 -4
- rem/agentic/providers/phoenix.py +32 -43
- rem/agentic/providers/pydantic_ai.py +168 -24
- rem/agentic/schema.py +358 -21
- rem/agentic/tools/rem_tools.py +3 -3
- rem/api/README.md +238 -1
- rem/api/deps.py +255 -0
- rem/api/main.py +154 -37
- rem/api/mcp_router/resources.py +1 -1
- rem/api/mcp_router/server.py +26 -5
- rem/api/mcp_router/tools.py +454 -7
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +494 -0
- rem/api/routers/auth.py +124 -0
- rem/api/routers/chat/completions.py +152 -16
- rem/api/routers/chat/models.py +7 -3
- rem/api/routers/chat/sse_events.py +526 -0
- rem/api/routers/chat/streaming.py +608 -45
- rem/api/routers/dev.py +81 -0
- rem/api/routers/feedback.py +148 -0
- rem/api/routers/messages.py +473 -0
- rem/api/routers/models.py +78 -0
- rem/api/routers/query.py +360 -0
- rem/api/routers/shared_sessions.py +406 -0
- rem/auth/middleware.py +126 -27
- rem/cli/commands/README.md +237 -64
- rem/cli/commands/ask.py +15 -11
- rem/cli/commands/cluster.py +1300 -0
- rem/cli/commands/configure.py +170 -97
- rem/cli/commands/db.py +396 -139
- rem/cli/commands/experiments.py +278 -96
- rem/cli/commands/process.py +22 -15
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +97 -50
- rem/cli/main.py +37 -6
- rem/config.py +2 -2
- rem/models/core/core_model.py +7 -1
- rem/models/core/rem_query.py +5 -2
- rem/models/entities/__init__.py +21 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/message.py +30 -1
- rem/models/entities/session.py +83 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/user.py +10 -3
- rem/registry.py +373 -0
- rem/schemas/agents/rem.yaml +7 -3
- rem/services/content/providers.py +94 -140
- rem/services/content/service.py +115 -24
- rem/services/dreaming/affinity_service.py +2 -16
- rem/services/dreaming/moment_service.py +2 -15
- rem/services/embeddings/api.py +24 -17
- rem/services/embeddings/worker.py +16 -16
- rem/services/phoenix/EXPERIMENT_DESIGN.md +3 -3
- rem/services/phoenix/client.py +252 -19
- rem/services/postgres/README.md +159 -15
- rem/services/postgres/__init__.py +2 -1
- rem/services/postgres/diff_service.py +531 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
- rem/services/postgres/repository.py +132 -0
- rem/services/postgres/schema_generator.py +291 -9
- rem/services/postgres/service.py +6 -6
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +14 -0
- rem/services/rem/parser.py +44 -9
- rem/services/rem/service.py +36 -2
- rem/services/session/compression.py +17 -1
- rem/services/session/reload.py +1 -1
- rem/services/user_service.py +98 -0
- rem/settings.py +169 -22
- rem/sql/background_indexes.sql +21 -16
- rem/sql/migrations/001_install.sql +387 -54
- rem/sql/migrations/002_install_models.sql +2320 -393
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +548 -0
- rem/utils/__init__.py +18 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/embeddings.py +17 -4
- rem/utils/files.py +167 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +156 -1
- rem/utils/schema_loader.py +284 -21
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +3 -1
- rem/utils/vision.py +9 -14
- rem/workers/README.md +14 -14
- rem/workers/__init__.py +2 -1
- rem/workers/db_maintainer.py +74 -0
- rem/workers/unlogged_maintainer.py +463 -0
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/METADATA +598 -171
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/RECORD +102 -73
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/WHEEL +1 -1
- rem/sql/002_install_models.sql +0 -1068
- rem/sql/install_models.sql +0 -1038
- {remdb-0.2.6.dist-info → remdb-0.3.118.dist-info}/entry_points.txt +0 -0
rem/services/content/providers.py CHANGED

@@ -2,17 +2,27 @@
 
 import json
 import multiprocessing
-import os
 import random
 import subprocess
 import sys
-import tempfile
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Optional
 
 from loguru import logger
 
+from rem.utils.constants import (
+    AUDIO_CHUNK_TARGET_SECONDS,
+    AUDIO_CHUNK_WINDOW_SECONDS,
+    MIN_SILENCE_MS,
+    SILENCE_THRESHOLD_DB,
+    SUBPROCESS_TIMEOUT_SECONDS,
+    WAV_HEADER_MIN_BYTES,
+    WHISPER_COST_PER_MINUTE,
+)
+from rem.utils.files import temp_file_from_bytes
+from rem.utils.mime_types import get_extension
+
 
 class ContentProvider(ABC):
     """Base class for content extraction providers."""
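The inlined magic numbers removed throughout this file now live in `rem.utils.constants`. A rough sketch of what that module presumably exports, based only on the names imported above — the actual values shipped in 0.3.118 are not shown in this diff and are assumptions, except where the removed inline code reveals them:

```python
# rem/utils/constants.py -- illustrative sketch, values are assumptions
# unless noted otherwise.

# Audio chunking: aim for a target chunk length and search for a silence
# boundary within a window around it.
AUDIO_CHUNK_TARGET_SECONDS = 600   # assumed value
AUDIO_CHUNK_WINDOW_SECONDS = 30    # assumed value
SILENCE_THRESHOLD_DB = -40         # assumed value
MIN_SILENCE_MS = 500               # assumed value

# Subprocess-based document parsing timeout.
SUBPROCESS_TIMEOUT_SECONDS = 300   # assumed value

# Audio validation and pricing.
WAV_HEADER_MIN_BYTES = 44          # standard WAV header size; exact constant assumed
WHISPER_COST_PER_MINUTE = 0.006    # matches the "$0.006 per minute" removed inline
```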
@@ -132,7 +142,7 @@ import sys
 from pathlib import Path
 from kreuzberg import ExtractionConfig, extract_file_sync
 
-# Parse document with
+# Parse document with kreuzberg 3.x
 config = ExtractionConfig(
     extract_tables=True,
     chunk_content=False,

@@ -144,13 +154,7 @@ result = extract_file_sync(Path(sys.argv[1]), config=config)
 # Serialize result to JSON
 output = {
     'content': result.content,
-    'tables': [
-        {
-            'page_number': t.get('page_number', 0),
-            'text': t.get('text', ''),
-        }
-        for t in result.tables
-    ],
+    'tables': [t.model_dump() for t in result.tables] if result.tables else [],
     'metadata': result.metadata
 }
 print(json.dumps(output))

@@ -161,7 +165,7 @@ print(json.dumps(output))
             [sys.executable, "-c", script, str(file_path)],
             capture_output=True,
             text=True,
-            timeout=
+            timeout=SUBPROCESS_TIMEOUT_SECONDS,
         )
 
         if result.returncode != 0:

@@ -183,21 +187,9 @@ print(json.dumps(output))
         # Write bytes to temp file for kreuzberg
         # Detect extension from metadata
         content_type = metadata.get("content_type", "")
-        extension_map = {
-            "application/pdf": ".pdf",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
-            "image/png": ".png",
-            "image/jpeg": ".jpg",
-        }
-        suffix = extension_map.get(content_type, ".pdf")  # Default to PDF
-
-        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
-            tmp.write(content)
-            tmp_path = Path(tmp.name)
+        suffix = get_extension(content_type, default=".pdf")
 
-        try:
+        with temp_file_from_bytes(content, suffix=suffix) as tmp_path:
             # Check if running in daemon process
             if self._is_daemon_process():
                 logger.info("Daemon process detected - using subprocess workaround for document parsing")
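Hunks like the one above swap the per-provider `extension_map` dicts and `tempfile.NamedTemporaryFile` boilerplate for two shared helpers. Their implementations are not part of this diff; a minimal sketch consistent with how they are called — `get_extension(content_type, default=...)` and `temp_file_from_bytes(content, suffix=...)` used as a context manager that yields a `Path` and cleans up afterwards — might look like:

```python
# Sketch of the assumed helpers; only the names and call signatures appear in the diff.
import tempfile
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path

# Hypothetical lookup table mirroring the inline maps removed above.
_EXTENSION_MAP = {
    "application/pdf": ".pdf",
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "audio/wav": ".wav",
}


def get_extension(content_type: str, default: str = "") -> str:
    """Map a MIME type to a file extension, falling back to `default`."""
    return _EXTENSION_MAP.get(content_type, default)


@contextmanager
def temp_file_from_bytes(content: bytes, suffix: str = "") -> Iterator[Path]:
    """Write bytes to a temporary file and delete it when the block exits."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(content)
        tmp_path = Path(tmp.name)
    try:
        yield tmp_path
    finally:
        tmp_path.unlink(missing_ok=True)
```

This centralizes the cleanup the old code did by hand in `finally: tmp_path.unlink(...)` blocks, which the later hunks delete.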
@@ -211,7 +203,7 @@ print(json.dumps(output))
                     }
                 except Exception as e:
                     logger.error(f"Subprocess parsing failed: {e}. Falling back to text-only.")
-                    # Fallback to simple text extraction
+                    # Fallback to simple text extraction (kreuzberg 3.x API)
                     from kreuzberg import ExtractionConfig, extract_file_sync
                     config = ExtractionConfig(extract_tables=False)
                     result = extract_file_sync(tmp_path, config=config)

@@ -221,17 +213,17 @@ print(json.dumps(output))
                        "file_extension": tmp_path.suffix,
                    }
            else:
-                # Normal execution (not in daemon)
+                # Normal execution (not in daemon) - kreuzberg 4.x with native ONNX/Rust
                from kreuzberg import ExtractionConfig, extract_file_sync
                config = ExtractionConfig(
-
-                    chunk_content=False,
-
+                    enable_quality_processing=True,  # Enables table extraction with native ONNX
+                    chunk_content=False,  # We handle chunking ourselves
+                    extract_tables=False,  # Disable table extraction to avoid PyTorch dependency
                )
                result = extract_file_sync(tmp_path, config=config)
                text = result.content
                extraction_metadata = {
-                    "table_count": len(result.tables),
+                    "table_count": len(result.tables) if result.tables else 0,
                    "parser": "kreuzberg",
                    "file_extension": tmp_path.suffix,
                }

@@ -241,10 +233,6 @@ print(json.dumps(output))
                "metadata": extraction_metadata,
            }
 
-        finally:
-            # Clean up temp file
-            tmp_path.unlink(missing_ok=True)
-
 
 class AudioProvider(ContentProvider):
     """

@@ -289,19 +277,20 @@ class AudioProvider(ContentProvider):
            ValueError: If OpenAI API key missing
        """
        # Handle empty or invalid content
-        if not content or len(content) <
+        if not content or len(content) < WAV_HEADER_MIN_BYTES:
            logger.warning("Audio content too small to be valid WAV file")
            return {
                "text": "[Invalid or empty audio file]",
                "metadata": {"error": "invalid_content", "size": len(content)},
            }
 
-        # Check for OpenAI API key
-
+        # Check for OpenAI API key (use settings)
+        from rem.settings import settings
+        api_key = settings.llm.openai_api_key
        if not api_key:
-            logger.warning("No
+            logger.warning("No OpenAI API key found - audio transcription disabled")
            return {
-                "text": "[Audio transcription requires
+                "text": "[Audio transcription requires LLM__OPENAI_API_KEY to be set]",
                "metadata": {"error": "missing_api_key"},
            }
 
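The warning text references `LLM__OPENAI_API_KEY`, while the code reads `settings.llm.openai_api_key`. That mapping is consistent with a nested pydantic-settings model using `__` as the environment-variable nesting delimiter. The real `rem.settings` module is not shown in this diff, so the following is only an illustrative sketch:

```python
# Sketch assuming a nested pydantic-settings layout; names beyond
# settings.llm.openai_api_key and LLM__OPENAI_API_KEY are assumptions.
from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict


class LLMSettings(BaseModel):
    openai_api_key: str | None = None


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_nested_delimiter="__")
    llm: LLMSettings = LLMSettings()


# Exporting LLM__OPENAI_API_KEY=sk-... would populate settings.llm.openai_api_key.
settings = Settings()
```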
@@ -318,83 +307,74 @@ class AudioProvider(ContentProvider):
        # Write bytes to temp file
        # Detect extension from metadata or use .wav as fallback
        content_type = metadata.get("content_type", "audio/wav")
-        extension_map = {
-            "audio/wav": ".wav",
-            "audio/mpeg": ".mp3",
-            "audio/mp4": ".m4a",
-            "audio/x-m4a": ".m4a",
-            "audio/flac": ".flac",
-            "audio/ogg": ".ogg",
-        }
-        extension = extension_map.get(content_type, ".wav")
+        extension = get_extension(content_type, default=".wav")
 
-
-
-            tmp_path = Path(tmp.name)
+        chunker = None
+        chunks = None
 
-
-
-
-
-
-
-
-
-
-
-
-            chunks = chunker.chunk_audio(tmp_path)
-            logger.info(f"Created {len(chunks)} audio chunks")
-
-            # Step 2: Transcribe chunks
-            transcriber = AudioTranscriber(api_key=api_key)
-            results = transcriber.transcribe_chunks(chunks)
-            logger.info(f"Transcribed {len(results)} chunks")
-
-            # Step 3: Combine into markdown format
-            # Format: Each chunk becomes a section with timestamp
-            markdown_parts = []
-            for result in results:
-                timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
-                markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")
-
-            markdown_text = "\n".join(markdown_parts)
-
-            # Calculate metadata
-            total_duration = sum(r.duration_seconds for r in results)
-            estimated_cost = (total_duration / 60) * 0.006  # $0.006 per minute
-            successful_chunks = sum(1 for r in results if r.confidence > 0)
-
-            extraction_metadata = {
-                "chunk_count": len(chunks),
-                "transcribed_chunks": successful_chunks,
-                "duration_seconds": total_duration,
-                "estimated_cost": estimated_cost,
-                "parser": "whisper_api",
-            }
+        with temp_file_from_bytes(content, suffix=extension) as tmp_path:
+            try:
+                logger.info(f"Processing audio file: {tmp_path.name} ({len(content) / 1024 / 1024:.1f} MB)")
+
+                # Step 1: Chunk audio by silence
+                chunker = AudioChunker(
+                    target_chunk_seconds=AUDIO_CHUNK_TARGET_SECONDS,
+                    chunk_window_seconds=AUDIO_CHUNK_WINDOW_SECONDS,
+                    silence_threshold_db=SILENCE_THRESHOLD_DB,
+                    min_silence_ms=MIN_SILENCE_MS,
+                )
 
-
-                f"
-                f"${estimated_cost:.3f} cost"
-            )
+                chunks = chunker.chunk_audio(tmp_path)
+                logger.info(f"Created {len(chunks)} audio chunks")
 
-
-
-
-
+                # Step 2: Transcribe chunks
+                transcriber = AudioTranscriber(api_key=api_key)
+                results = transcriber.transcribe_chunks(chunks)
+                logger.info(f"Transcribed {len(results)} chunks")
 
-
-
-
+                # Step 3: Combine into markdown format
+                # Format: Each chunk becomes a section with timestamp
+                markdown_parts = []
+                for result in results:
+                    timestamp = f"{result.start_seconds:.1f}s - {result.end_seconds:.1f}s"
+                    markdown_parts.append(f"## [{timestamp}]\n\n{result.text}\n")
+
+                markdown_text = "\n".join(markdown_parts)
+
+                # Calculate metadata
+                total_duration = sum(r.duration_seconds for r in results)
+                estimated_cost = (total_duration / 60) * WHISPER_COST_PER_MINUTE
+                successful_chunks = sum(1 for r in results if r.confidence > 0)
+
+                extraction_metadata = {
+                    "chunk_count": len(chunks),
+                    "transcribed_chunks": successful_chunks,
+                    "duration_seconds": total_duration,
+                    "estimated_cost": estimated_cost,
+                    "parser": "whisper_api",
+                }
+
+                logger.info(
+                    f"Transcription complete: {successful_chunks}/{len(chunks)} chunks, "
+                    f"${estimated_cost:.3f} cost"
+                )
+
+                return {
+                    "text": markdown_text,
+                    "metadata": extraction_metadata,
+                }
 
-        finally:
-            # Clean up temp file and chunks
-            try:
-                tmp_path.unlink(missing_ok=True)
-                if 'chunker' in locals() and 'chunks' in locals():
-                    chunker.cleanup_chunks(chunks)
            except Exception as e:
-                logger.
+                logger.error(f"Audio extraction failed: {e}")
+                raise RuntimeError(f"Audio transcription failed: {e}") from e
+
+            finally:
+                # Clean up audio chunks (temp file cleanup handled by context manager)
+                if chunker is not None and chunks is not None:
+                    try:
+                        chunker.cleanup_chunks(chunks)
+                    except Exception as e:
+                        logger.warning(f"Chunk cleanup failed: {e}")
 
 
 class SchemaProvider(ContentProvider):

@@ -672,19 +652,9 @@ class ImageProvider(ContentProvider):
 
            # Write bytes to temp file for analysis
            content_type = metadata.get("content_type", "image/png")
-            extension_map = {
-                "image/png": ".png",
-                "image/jpeg": ".jpg",
-                "image/gif": ".gif",
-                "image/webp": ".webp",
-            }
-            extension = extension_map.get(content_type, ".png")
-
-            with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as tmp:
-                tmp.write(content)
-                tmp_path = Path(tmp.name)
+            extension = get_extension(content_type, default=".png")
 
-            try:
+            with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                # Analyze image
                result = analyzer.analyze_image(tmp_path)
                vision_description = result.description

@@ -692,9 +662,6 @@ class ImageProvider(ContentProvider):
                vision_model = result.model
 
                logger.info(f"Vision analysis complete: {len(vision_description)} chars")
-            finally:
-                # Clean up temp file
-                tmp_path.unlink(missing_ok=True)
 
        except ImportError as e:
            logger.warning(f"Vision analysis not available: {e}")

@@ -737,19 +704,9 @@ class ImageProvider(ContentProvider):
        if embedder.is_available():
            # Write bytes to temp file for CLIP embedding
            content_type = metadata.get("content_type", "image/png")
-            extension_map = {
-                "image/png": ".png",
-                "image/jpeg": ".jpg",
-                "image/gif": ".gif",
-                "image/webp": ".webp",
-            }
-            extension = extension_map.get(content_type, ".png")
+            extension = get_extension(content_type, default=".png")
 
-            with
-                tmp.write(content)
-                tmp_path = Path(tmp.name)
-
-            try:
+            with temp_file_from_bytes(content, suffix=extension) as tmp_path:
                # Generate CLIP embedding
                result = embedder.embed_image(tmp_path)
                if result:

@@ -759,9 +716,6 @@ class ImageProvider(ContentProvider):
                    logger.info(
                        f"CLIP embedding generated: {clip_dimensions} dims, {clip_tokens} tokens"
                    )
-            finally:
-                # Clean up temp file
-                tmp_path.unlink(missing_ok=True)
        else:
            logger.debug(
                "CLIP embeddings disabled - set CONTENT__JINA_API_KEY to enable. "
rem/services/content/service.py CHANGED

@@ -159,13 +159,22 @@ class ContentService:
 
            extracted_content = provider.extract(content_bytes, metadata)
 
-            return {
+            # Build result with standard fields
+            result = {
                "uri": uri,
                "content": extracted_content["text"],
                "metadata": {**metadata, **extracted_content.get("metadata", {})},
                "provider": provider.name,
            }
 
+            # Preserve schema-specific fields if present (from SchemaProvider)
+            if "is_schema" in extracted_content:
+                result["is_schema"] = extracted_content["is_schema"]
+            if "schema_data" in extracted_content:
+                result["schema_data"] = extracted_content["schema_data"]
+
+            return result
+
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code", "")
            if error_code == "NoSuchKey":

@@ -221,13 +230,22 @@ class ContentService:
        provider = self._get_provider(file_path.suffix)
        extracted_content = provider.extract(content_bytes, metadata)
 
-        return {
+        # Build result with standard fields
+        result = {
            "uri": str(file_path.absolute()),
            "content": extracted_content["text"],
            "metadata": {**metadata, **extracted_content.get("metadata", {})},
            "provider": provider.name,
        }
 
+        # Preserve schema-specific fields if present (from SchemaProvider)
+        if "is_schema" in extracted_content:
+            result["is_schema"] = extracted_content["is_schema"]
+        if "schema_data" in extracted_content:
+            result["schema_data"] = extracted_content["schema_data"]
+
+        return result
+
    def _get_provider(self, suffix: str) -> ContentProvider:
        """Get content provider for file extension."""
        suffix_lower = suffix.lower()

@@ -260,6 +278,7 @@ class ContentService:
        category: str | None = None,
        tags: list[str] | None = None,
        is_local_server: bool = False,
+        resource_type: str | None = None,
    ) -> dict[str, Any]:
        """
        Complete file ingestion pipeline: read → store → parse → chunk → embed.

@@ -304,6 +323,9 @@ class ContentService:
            category: Optional category tag (document, code, audio, etc.)
            tags: Optional list of tags
            is_local_server: True if running as local/stdio MCP server
+            resource_type: Optional resource type (case-insensitive). Supports:
+                - "resource", "resources", "Resource" → Resource (default)
+                - "domain-resource", "domain_resource", "DomainResource" → DomainResource
 
        Returns:
            dict with:

@@ -348,11 +370,32 @@ class ContentService:
        file_size = len(file_content)
        logger.info(f"Read {file_size} bytes from {file_uri} (source: {source_type})")
 
-        # Step
+        # Step 1.5: Early schema detection for YAML/JSON files
+        # Skip File entity creation for schemas (agents/evaluators)
+        file_suffix = Path(file_name).suffix.lower()
+        if file_suffix in ['.yaml', '.yml', '.json']:
+            import yaml
+            import json
+            try:
+                content_text = file_content.decode('utf-8') if isinstance(file_content, bytes) else file_content
+                data = yaml.safe_load(content_text) if file_suffix in ['.yaml', '.yml'] else json.loads(content_text)
+                if isinstance(data, dict):
+                    json_schema_extra = data.get('json_schema_extra', {})
+                    kind = json_schema_extra.get('kind', '')
+                    if kind in ['agent', 'evaluator']:
+                        # Route directly to schema processing, skip File entity
+                        logger.info(f"Detected {kind} schema: {file_name}, routing to _process_schema")
+                        result = self.process_uri(file_uri)
+                        return await self._process_schema(result, file_uri, user_id)
+            except Exception as e:
+                logger.debug(f"Early schema detection failed for {file_name}: {e}")
+                # Fall through to standard file processing
+
+        # Step 2: Write to internal storage (public or user-scoped)
        file_id = str(uuid4())
        storage_uri, internal_key, content_type, _ = await fs_service.write_to_internal_storage(
            content=file_content,
-            tenant_id=user_id, #
+            tenant_id=user_id or "public",  # Storage path: public/ or user_id/
            file_name=file_name,
            file_id=file_id,
        )
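For reference, the early-detection branch above fires when a YAML or JSON payload carries `json_schema_extra.kind` equal to `agent` or `evaluator`; everything else falls through to normal file processing. A tiny, hypothetical payload and the check it triggers:

```python
# Illustrative only: field values are made up, the lookup mirrors the diff above.
import yaml

sample = yaml.safe_load("""
name: my-agent
description: Example agent schema
json_schema_extra:
  kind: agent
""")

kind = sample.get("json_schema_extra", {}).get("kind", "")
assert kind in ["agent", "evaluator"]  # would be routed to _process_schema
```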
@@ -361,7 +404,7 @@ class ContentService:
        # Step 3: Create File entity
        file_entity = File(
            id=file_id,
-            tenant_id=user_id, #
+            tenant_id=user_id,  # None = public/shared
            user_id=user_id,
            name=file_name,
            uri=storage_uri,

@@ -400,6 +443,7 @@ class ContentService:
        processing_result = await self.process_and_save(
            uri=storage_uri,
            user_id=user_id,
+            resource_type=resource_type,
        )
        processing_status = processing_result.get("status", "completed")
        resources_created = processing_result.get("chunk_count", 0)

@@ -441,7 +485,12 @@ class ContentService:
            "message": f"File ingested and {processing_status}. Created {resources_created} resources.",
        }
 
-    async def process_and_save(
+    async def process_and_save(
+        self,
+        uri: str,
+        user_id: str | None = None,
+        resource_type: str | None = None,
+    ) -> dict[str, Any]:
        """
        Process file end-to-end: extract → markdown → chunk → save.
 
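A hypothetical call site for the widened `process_and_save` signature — only the parameter names and accepted `resource_type` spellings come from the diff; the constructor arguments and URIs below are made up:

```python
import asyncio

from rem.services.content.service import ContentService


async def ingest_examples() -> None:
    service = ContentService()  # construction details are not shown in this diff

    # Default: chunks land in the Resource table.
    await service.process_and_save(uri="s3://bucket/report.pdf")

    # Case-insensitive alias routes chunks to the DomainResource table instead.
    await service.process_and_save(
        uri="s3://bucket/policy.docx",
        user_id="user-123",
        resource_type="domain-resource",
    )


asyncio.run(ingest_examples())
```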
@@ -456,6 +505,8 @@ class ContentService:
        Args:
            uri: File URI (s3://bucket/key or local path)
            user_id: Optional user ID for multi-tenancy
+            resource_type: Optional resource type (case-insensitive). Defaults to "Resource".
+                Supports: resource, domain-resource, domain_resource, DomainResource, etc.
 
        Returns:
            dict with file metadata and chunk count

@@ -470,8 +521,9 @@ class ContentService:
        file_suffix = Path(uri).suffix.lower()
        if file_suffix in ['.yaml', '.yml', '.json']:
            # Check if schema provider detected a valid schema
-
-
+            # is_schema flag is at top level of result (preserved from SchemaProvider)
+            if result.get('is_schema'):
+                logger.info(f"🔧 Custom provider flow initiated: kind={result.get('metadata', {}).get('kind')} for {filename}")
                return await self._process_schema(result, uri, user_id)
 
        # Check for engram kind in raw data

@@ -507,7 +559,7 @@ class ContentService:
            size_bytes=result["metadata"].get("size"),
            mime_type=result["metadata"].get("content_type"),
            processing_status="completed",
-            tenant_id=user_id
+            tenant_id=user_id,  # None = public/shared
            user_id=user_id,
        )
 
@@ -515,28 +567,66 @@ class ContentService:
        await self.file_repo.upsert(file)
        logger.info(f"Saved File: {filename}")
 
-        #
-
-
+        # Resolve resource model class from type parameter (case-insensitive)
+        from typing import cast, Type
+        from pydantic import BaseModel
+        from rem.utils.model_helpers import model_from_arbitrary_casing, get_table_name
+
+        resource_model: Type[BaseModel] = Resource  # Default
+        if resource_type:
+            try:
+                resource_model = model_from_arbitrary_casing(resource_type)
+                logger.info(f"Using resource model: {resource_model.__name__}")
+            except ValueError as e:
+                logger.warning(f"Invalid resource_type '{resource_type}', using default Resource: {e}")
+                resource_model = Resource
+
+        # Get table name for the resolved model
+        table_name = get_table_name(resource_model)
+
+        # Create resource entities for each chunk
+        resources: list[BaseModel] = [
+            resource_model(
                name=f"{filename}#chunk-{i}",
                uri=f"{uri}#chunk-{i}",
                ordinal=i,
                content=chunk,
                category="document",
-                tenant_id=user_id
+                tenant_id=user_id,  # None = public/shared
                user_id=user_id,
            )
            for i, chunk in enumerate(chunks)
        ]
 
-
-
-
-
-
-
-
-
+        # Save resources to the appropriate table
+        if resources:
+            from rem.services.postgres import get_postgres_service
+
+            postgres = get_postgres_service()
+            if postgres:
+                await postgres.connect()
+                try:
+                    await postgres.batch_upsert(
+                        records=cast(list[BaseModel | dict], resources),
+                        model=resource_model,
+                        table_name=table_name,
+                        entity_key_field="name",
+                        embeddable_fields=["content"],
+                        generate_embeddings=True,
+                    )
+                    logger.info(f"Saved {len(resources)} {resource_model.__name__} chunks to {table_name}")
+                    logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
+                finally:
+                    await postgres.disconnect()
+            elif self.resource_repo:
+                # Fallback to injected repo (only works for default Resource)
+                await self.resource_repo.upsert(
+                    resources,
+                    embeddable_fields=["content"],
+                    generate_embeddings=True,
+                )
+                logger.info(f"Saved {len(resources)} Resource chunks")
+                logger.info(f"Queued {len(resources)} embedding generation tasks for content field")
 
        return {
            "file": file.model_dump(),
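`model_from_arbitrary_casing` and `get_table_name` come from `rem/utils/model_helpers.py`, which this diff does not show. A speculative sketch of the lookup behaviour implied by the docstring — the registry contents, the normalization rules, and the pluralized snake_case table convention are all assumptions:

```python
# Assumed implementation sketch; only the function names and the accepted
# spellings ("resource", "domain-resource", "DomainResource", ...) are from the diff.
from pydantic import BaseModel

# Hypothetical registry of entity models keyed by a normalized name.
_MODEL_REGISTRY: dict[str, type[BaseModel]] = {}


def _normalize(name: str) -> str:
    # "domain-resource", "domain_resource", "DomainResources" -> "domainresource"
    return name.replace("-", "").replace("_", "").rstrip("s").lower()


def model_from_arbitrary_casing(name: str) -> type[BaseModel]:
    """Resolve a model class from a loosely spelled type name."""
    key = _normalize(name)
    try:
        return _MODEL_REGISTRY[key]
    except KeyError as e:
        raise ValueError(f"Unknown resource type: {name!r}") from e


def get_table_name(model: type[BaseModel]) -> str:
    # e.g. DomainResource -> "domain_resources" (snake_case plural, assumed convention)
    snake = "".join(f"_{c.lower()}" if c.isupper() else c for c in model.__name__).lstrip("_")
    return f"{snake}s"
```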
@@ -576,9 +666,10 @@ class ContentService:
        # IMPORTANT: category field distinguishes agents from evaluators
        # - kind=agent → category="agent" (AI agents with tools/resources)
        # - kind=evaluator → category="evaluator" (LLM-as-a-Judge evaluators)
+        # Schemas (agents/evaluators) default to system tenant for shared access
        schema_entity = Schema(
-            tenant_id=
-            user_id=
+            tenant_id="system",
+            user_id=None,
            name=name,
            spec=schema_data,
            category=kind,  # Maps kind → category for database filtering

@@ -648,7 +739,7 @@ class ContentService:
        processor = EngramProcessor(postgres)
        result = await processor.process_engram(
            data=data,
-            tenant_id=user_id
+            tenant_id=user_id,  # None = public/shared
            user_id=user_id,
        )
        logger.info(f"✅ Engram processed: {result.get('resource_id')} with {len(result.get('moment_ids', []))} moments")
rem/services/dreaming/affinity_service.py CHANGED

@@ -8,12 +8,11 @@ vector similarity (fast) or LLM analysis (intelligent).
 import json
 from datetime import datetime, timedelta
 from enum import Enum
-from pathlib import Path
 from typing import Any, Optional
 
-import yaml
 from loguru import logger
 
+from ...utils.schema_loader import load_agent_schema
 from ...agentic.providers.pydantic_ai import create_agent
 from ...agentic.serialization import serialize_agent_result
 from ...models.core import QueryType, RemQuery, SearchParameters

@@ -125,20 +124,7 @@ async def build_affinity(
    # Load LLM agent for relationship assessment if needed
    affinity_agent = None
    if mode == AffinityMode.LLM:
-        schema_path = (
-            Path(__file__).parent.parent.parent
-            / "schemas"
-            / "agents"
-            / "resource-affinity-assessor.yaml"
-        )
-
-        if not schema_path.exists():
-            raise FileNotFoundError(
-                f"ResourceAffinityAssessor schema not found: {schema_path}"
-            )
-
-        with open(schema_path) as f:
-            agent_schema = yaml.safe_load(f)
+        agent_schema = load_agent_schema("resource-affinity-assessor")
 
        affinity_agent_runtime = await create_agent(
            agent_schema_override=agent_schema,
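`load_agent_schema` replaces the inline path construction and `yaml.safe_load` call shown in the removed lines above. A sketch that mirrors that removed logic — the packaged-schema path resolution is an assumption about where `rem/utils/schema_loader.py` looks, and the real module (which grew by roughly 280 lines in this release) likely does more:

```python
# Assumed shape of rem/utils/schema_loader.py, modeled on the code it replaces.
from pathlib import Path
from typing import Any

import yaml


def load_agent_schema(name: str) -> dict[str, Any]:
    """Load a packaged agent schema by name, e.g. "resource-affinity-assessor"."""
    # Assumption: schemas ship under rem/schemas/agents/<name>.yaml.
    schema_path = Path(__file__).parent.parent / "schemas" / "agents" / f"{name}.yaml"
    if not schema_path.exists():
        raise FileNotFoundError(f"Agent schema not found: {schema_path}")
    with open(schema_path) as f:
        return yaml.safe_load(f)
```

Centralizing the loader is what lets callers such as `build_affinity` drop their `pathlib`/`yaml` imports, as the hunk above shows.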
|