remdb 0.3.180__py3-none-any.whl → 0.3.230__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/agentic/README.md +36 -2
- rem/agentic/context.py +173 -0
- rem/agentic/context_builder.py +12 -2
- rem/agentic/mcp/tool_wrapper.py +2 -2
- rem/agentic/providers/pydantic_ai.py +1 -1
- rem/agentic/schema.py +2 -2
- rem/api/main.py +1 -1
- rem/api/mcp_router/server.py +4 -0
- rem/api/mcp_router/tools.py +542 -166
- rem/api/routers/admin.py +30 -4
- rem/api/routers/auth.py +106 -10
- rem/api/routers/chat/child_streaming.py +379 -0
- rem/api/routers/chat/completions.py +74 -37
- rem/api/routers/chat/sse_events.py +7 -3
- rem/api/routers/chat/streaming.py +352 -257
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +7 -1
- rem/api/routers/feedback.py +9 -1
- rem/api/routers/messages.py +176 -38
- rem/api/routers/models.py +9 -1
- rem/api/routers/query.py +12 -1
- rem/api/routers/shared_sessions.py +16 -0
- rem/auth/jwt.py +19 -4
- rem/auth/middleware.py +42 -28
- rem/cli/README.md +62 -0
- rem/cli/commands/ask.py +61 -81
- rem/cli/commands/db.py +55 -31
- rem/cli/commands/process.py +171 -43
- rem/models/entities/ontology.py +18 -20
- rem/schemas/agents/rem.yaml +1 -1
- rem/services/content/service.py +18 -5
- rem/services/embeddings/worker.py +26 -12
- rem/services/postgres/__init__.py +28 -3
- rem/services/postgres/diff_service.py +57 -5
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
- rem/services/postgres/register_type.py +11 -10
- rem/services/postgres/repository.py +39 -29
- rem/services/postgres/schema_generator.py +5 -5
- rem/services/postgres/sql_builder.py +6 -5
- rem/services/session/__init__.py +8 -1
- rem/services/session/compression.py +40 -2
- rem/services/session/pydantic_messages.py +292 -0
- rem/settings.py +28 -0
- rem/sql/migrations/001_install.sql +125 -7
- rem/sql/migrations/002_install_models.sql +159 -149
- rem/sql/migrations/004_cache_system.sql +7 -275
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/schema_loader.py +79 -51
- {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/METADATA +2 -2
- {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/RECORD +54 -48
- {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/WHEEL +0 -0
- {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/entry_points.txt +0 -0
rem/cli/README.md
CHANGED

````diff
@@ -434,6 +434,68 @@ Ensure you're using the correct model format:
 - OpenAI: `openai:gpt-4o-mini`, `openai:gpt-4o`
 - Anthropic: `anthropic:claude-sonnet-4-5-20250929`
 
+## Data Visibility: PUBLIC vs PRIVATE
+
+**IMPORTANT: All ingested data is PUBLIC by default.** This is the correct behavior
+for shared knowledge bases (ontologies, procedures, reference data).
+
+### Why PUBLIC by Default?
+
+Most data in REM should be searchable by all users:
+- Clinical ontologies (disorders, symptoms, drugs)
+- Procedures and protocols (SCID-5, PHQ-9, etc.)
+- Reference documentation
+- Shared domain knowledge
+
+The `rem_lookup()` function searches for data where `user_id IS NULL`, which means
+public data. If you set `user_id` on data, it becomes invisible to other users.
+
+### Ingesting Public Data (Default)
+
+```bash
+# Standard ingestion - data is PUBLIC
+rem process ingest ontology/procedures/ --table ontologies
+
+# From S3 - also PUBLIC
+rem process ingest s3://bucket/docs/reference.pdf
+```
+
+### Ingesting Private Data (Rare)
+
+Private data requires explicit `--make-private` flag:
+
+```bash
+# Private user data - requires --make-private and --user-id
+rem process ingest personal-notes.md --make-private --user-id user-123
+```
+
+**When to use private data:**
+- User-uploaded personal documents
+- Session-specific content
+- User notes and annotations
+
+**NEVER use private data for:**
+- Ontologies and reference material
+- Clinical procedures and protocols
+- Shared knowledge bases
+- Anything that should be searchable by agents
+
+### Common Mistake
+
+If agents can't find data via `search_rem`, the most common cause is that the data
+was ingested with a `user_id` set. Check with:
+
+```sql
+SELECT name, user_id FROM ontologies WHERE name = 'phq-9-procedure';
+-- user_id should be NULL for public data
+```
+
+Fix by setting user_id to NULL:
+```sql
+UPDATE ontologies SET user_id = NULL WHERE user_id IS NOT NULL;
+UPDATE kv_store SET user_id = NULL WHERE entity_type = 'ontologies' AND user_id IS NOT NULL;
+```
+
 ## Next Steps
 
 1. **Implement Schema Registry**
````
rem/cli/commands/ask.py
CHANGED

````diff
@@ -71,16 +71,18 @@ async def run_agent_streaming(
     max_turns: int = 10,
     context: AgentContext | None = None,
     max_iterations: int | None = None,
+    user_message: str | None = None,
 ) -> None:
     """
-    Run agent in streaming mode using …
+    Run agent in streaming mode using the SAME code path as the API.
 
-    … (6 lines not shown in the package diff)
+    This uses stream_openai_response_with_save from the API to ensure:
+    1. Tool calls are saved as separate "tool" messages (not embedded in content)
+    2. Assistant response is clean text only (no [Calling: ...] markers)
+    3. CLI testing is equivalent to API testing
+
+    The CLI displays tool calls as [Calling: tool_name] for visibility,
+    but these are NOT saved to the database.
 
     Args:
         agent: Pydantic AI agent
@@ -88,88 +90,66 @@ async def run_agent_streaming(
         max_turns: Maximum turns for agent execution (not used in current API)
         context: Optional AgentContext for session persistence
         max_iterations: Maximum iterations/requests (from agent schema or settings)
+        user_message: The user's original message (for database storage)
     """
-
-    from rem.…
+    import json
+    from rem.api.routers.chat.streaming import stream_openai_response_with_save, save_user_message
 
     logger.info("Running agent in streaming mode...")
 
     try:
-        # …
-        … (3 lines not shown in the package diff)
-        # Accumulate assistant response for session persistence
-        assistant_response_parts = []
-
-        # Use agent.iter() to get complete execution with tool calls
-        usage_limits = UsageLimits(request_limit=max_iterations) if max_iterations else None
-        async with agent.iter(prompt, usage_limits=usage_limits) as agent_run:
-            async for node in agent_run:
-                # Check if this is a model request node (includes tool calls and text)
-                if PydanticAgent.is_model_request_node(node):
-                    # Stream events from model request
-                    request_stream: Any
-                    async with node.stream(agent_run.ctx) as request_stream:
-                        async for event in request_stream:
-                            # Tool call start event
-                            if isinstance(event, PartStartEvent) and isinstance(
-                                event.part, ToolCallPart
-                            ):
-                                tool_marker = f"\n[Calling: {event.part.tool_name}]"
-                                print(tool_marker, flush=True)
-                                assistant_response_parts.append(tool_marker)
-
-                            # Text content delta
-                            elif isinstance(event, PartDeltaEvent) and isinstance(
-                                event.delta, TextPartDelta
-                            ):
-                                print(event.delta.content_delta, end="", flush=True)
-                                assistant_response_parts.append(event.delta.content_delta)
-
-        print("\n")  # Final newline after streaming
-
-        # Get final result from agent_run
-        result = agent_run.result
-        if hasattr(result, "output"):
-            logger.info("Final structured result:")
-            output = result.output
-            from rem.agentic.serialization import serialize_agent_result
-            output_json = json.dumps(serialize_agent_result(output), indent=2)
-            print(output_json)
-            assistant_response_parts.append(f"\n{output_json}")
-
-        # Save session messages (if session_id provided and postgres enabled)
-        if context and context.session_id and settings.postgres.enabled:
-            from ...services.session.compression import SessionMessageStore
-
-            # Extract just the user query from prompt
-            # Prompt format from ContextBuilder: system + history + user message
-            # We need to extract the last user message
-            user_message_content = prompt.split("\n\n")[-1] if "\n\n" in prompt else prompt
-
-            user_message = {
-                "role": "user",
-                "content": user_message_content,
-                "timestamp": to_iso_with_z(utc_now()),
-            }
-
-            assistant_message = {
-                "role": "assistant",
-                "content": "".join(assistant_response_parts),
-                "timestamp": to_iso_with_z(utc_now()),
-            }
-
-            # Store messages with compression
-            store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
-            await store.store_session_messages(
+        # Save user message BEFORE streaming (same as API, using shared utility)
+        if context and context.session_id and user_message:
+            await save_user_message(
                 session_id=context.session_id,
-                messages=[user_message, assistant_message],
                 user_id=context.user_id,
-                … (1 line not shown in the package diff)
+                content=user_message,
             )
 
-        … (1 line not shown in the package diff)
+        # Use the API streaming code path for consistency
+        # This properly handles tool calls and message persistence
+        model_name = getattr(agent, 'model', 'unknown')
+        if hasattr(model_name, 'model_name'):
+            model_name = model_name.model_name
+        elif hasattr(model_name, 'name'):
+            model_name = model_name.name
+        else:
+            model_name = str(model_name)
+
+        async for chunk in stream_openai_response_with_save(
+            agent=agent.agent if hasattr(agent, 'agent') else agent,
+            prompt=prompt,
+            model=model_name,
+            session_id=context.session_id if context else None,
+            user_id=context.user_id if context else None,
+            agent_context=context,
+        ):
+            # Parse SSE chunks for CLI display
+            if chunk.startswith("event: tool_call"):
+                # Extract tool call info from next data line
+                continue
+            elif chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
+                try:
+                    data_str = chunk[6:].strip()
+                    if data_str:
+                        data = json.loads(data_str)
+                        # Check for tool_call event
+                        if data.get("type") == "tool_call":
+                            tool_name = data.get("tool_name", "tool")
+                            status = data.get("status", "")
+                            if status == "started":
+                                print(f"\n[Calling: {tool_name}]", flush=True)
+                        # Check for text content (OpenAI format)
+                        elif "choices" in data and data["choices"]:
+                            delta = data["choices"][0].get("delta", {})
+                            content = delta.get("content")
+                            if content:
+                                print(content, end="", flush=True)
+                except (json.JSONDecodeError, KeyError, IndexError):
+                    pass
+
+        print("\n")  # Final newline after streaming
+        logger.info("Final structured result:")
 
     except Exception as e:
         logger.error(f"Agent execution failed: {e}")
@@ -549,7 +529,7 @@ async def _ask_async(
 
     # Run agent with session persistence
     if stream:
-        await run_agent_streaming(agent, prompt, max_turns=max_turns, context=context)
+        await run_agent_streaming(agent, prompt, max_turns=max_turns, context=context, user_message=query)
     else:
         await run_agent_non_streaming(
             agent,
````
rem/cli/commands/db.py
CHANGED

````diff
@@ -375,8 +375,10 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
     import polars as pl
     import yaml
     from ...models.core.inline_edge import InlineEdge
-    from ...models.entities import …
+    from ...models.entities import SharedSession
     from ...services.postgres import get_postgres_service
+    from ...utils.model_helpers import get_table_name
+    from ... import get_model_registry
 
     logger.info(f"Loading data from: {file_path}")
     scope_msg = f"user: {user_id}" if user_id else "public"
@@ -385,13 +387,12 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
     suffix = file_path.suffix.lower()
     is_yaml = suffix in {".yaml", ".yml"}
 
-    # …
+    # Build MODEL_MAP dynamically from registry
+    registry = get_model_registry()
+    registry.register_core_models()
     MODEL_MAP = {
-        … (2 lines not shown in the package diff)
-        "resources": Resource,
-        "messages": Message,
-        "schemas": Schema,
+        get_table_name(model): model
+        for model in registry.get_model_classes().values()
     }
 
     # Non-CoreModel tables that need direct SQL insertion
@@ -432,12 +433,9 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
     logger.info(f"Columns: {list(df.columns)}")
 
     # Validate first row against model if table is known
-    if table in …
-        from ...models.entities import Resource, Moment, User, Message, Schema
+    if table in MODEL_MAP and rows:
         from ...utils.model_helpers import validate_data_for_model
-        … (model_map literal truncated in the package diff)
-            "messages": Message, "schemas": Schema}
-        result = validate_data_for_model(model_map[table], rows[0])
+        result = validate_data_for_model(MODEL_MAP[table], rows[0])
         if result.extra_fields:
             logger.warning(f"Unknown fields (ignored): {result.extra_fields}")
         if result.valid:
@@ -457,6 +455,10 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
 
     await pg.connect()
 
+    # Start embedding worker for generating embeddings
+    if pg.embedding_worker:
+        await pg.embedding_worker.start()
+
     try:
         total_loaded = 0
 
@@ -467,8 +469,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
         # Handle direct insert tables (non-CoreModel)
         if table_name in DIRECT_INSERT_TABLES:
             for row_data in rows:
-                … (1 line not shown in the package diff)
-                row_data["tenant_id"] = "default"
+                # tenant_id is optional - NULL means public/shared
 
                 if table_name == "shared_sessions":
                     await pg.fetch(
@@ -479,7 +480,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
                         row_data["session_id"],
                         row_data["owner_user_id"],
                         row_data["shared_with_user_id"],
-                        row_data…
+                        row_data.get("tenant_id"),  # Optional - NULL means public
                     )
                     total_loaded += 1
                     logger.success(f"Loaded shared_session: {row_data['owner_user_id']} -> {row_data['shared_with_user_id']}")
@@ -492,10 +493,8 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
             model_class = MODEL_MAP[table_name]
 
             for row_idx, row_data in enumerate(rows):
-                … (2 lines not shown in the package diff)
-                if "tenant_id" not in row_data and user_id is not None:
-                    row_data["tenant_id"] = row_data.get("user_id", user_id)
+                # tenant_id and user_id are optional - NULL means public/shared data
+                # Data files can explicitly set tenant_id/user_id if needed
 
                 # Convert graph_edges to InlineEdge format if present
                 if "graph_edges" in row_data:
@@ -530,6 +529,14 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d…
 
         logger.success(f"Data loaded successfully! Total rows: {total_loaded}")
 
+        # Wait for embeddings to complete
+        if pg.embedding_worker and pg.embedding_worker.running:
+            queue_size = pg.embedding_worker.task_queue.qsize()
+            if queue_size > 0:
+                logger.info(f"Waiting for {queue_size} embeddings to complete...")
+            await pg.embedding_worker.stop()
+            logger.success("Embeddings generated successfully")
+
     finally:
         await pg.disconnect()
 
@@ -634,7 +641,7 @@ async def _diff_async(
 
     if not result.has_changes:
        click.secho("✓ No schema drift detected", fg="green")
-       click.echo("  Database matches …
+       click.echo("  Database matches source (tables, functions, triggers, views)")
        if result.filtered_count > 0:
            click.echo()
            click.secho(f"  ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
@@ -646,17 +653,34 @@
     if result.filtered_count > 0:
         click.secho(f"  ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
     click.echo()
-    … (11 lines not shown in the package diff)
+
+    # Table/column changes (Alembic)
+    if result.summary:
+        click.echo("Table Changes:")
+        for line in result.summary:
+            if line.startswith("+"):
+                click.secho(f"  {line}", fg="green")
+            elif line.startswith("-"):
+                click.secho(f"  {line}", fg="red")
+            elif line.startswith("~"):
+                click.secho(f"  {line}", fg="yellow")
+            else:
+                click.echo(f"  {line}")
+        click.echo()
+
+    # Programmable object changes (functions, triggers, views)
+    if result.programmable_summary:
+        click.echo("Programmable Objects (functions/triggers/views):")
+        for line in result.programmable_summary:
+            if line.startswith("+"):
+                click.secho(f"  {line}", fg="green")
+            elif line.startswith("-"):
+                click.secho(f"  {line}", fg="red")
+            elif line.startswith("~"):
+                click.secho(f"  {line}", fg="yellow")
+            else:
+                click.echo(f"  {line}")
+        click.echo()
 
     # Generate migration if requested
     if generate:
````
rem/cli/commands/process.py
CHANGED

````diff
@@ -11,39 +11,102 @@ from rem.services.content import ContentService
 
 
 @click.command(name="ingest")
-@click.argument("…
-@click.option("--…
+@click.argument("path", type=click.Path(exists=True))
+@click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
+@click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
+@click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
 @click.option("--category", help="Optional file category")
 @click.option("--tags", help="Optional comma-separated tags")
+@click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
+@click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
 def process_ingest(
-    … (1 line not shown in the package diff)
+    path: str,
+    table: str | None,
+    make_private: bool,
     user_id: str | None,
     category: str | None,
     tags: str | None,
+    pattern: str,
+    dry_run: bool,
 ):
     """
-    Ingest …
+    Ingest files into REM (storage + parsing + embedding).
 
-    … (6 lines not shown in the package diff)
+    Supports both single files and directories. For directories, recursively
+    processes files matching the pattern (default: **/*.md).
+
+    **IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
+    shared knowledge bases (ontologies, procedures, reference data). Private
+    user-scoped data is rarely needed and requires explicit --make-private flag.
+
+    Target table is auto-detected for schemas (agent.yaml → schemas table).
+    Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).
 
     Examples:
         rem process ingest sample.pdf
         rem process ingest contract.docx --category legal --tags contract,2023
        rem process ingest agent.yaml  # Auto-detects kind=agent, saves to schemas table
+
+        # Directory ingestion into ontologies table (PUBLIC - no user-id needed)
+        rem process ingest ontology/procedures/scid-5/ --table ontologies
+        rem process ingest ontology/ --table ontologies --pattern "**/*.md"
+
+        # Preview what would be ingested
+        rem process ingest ontology/ --table ontologies --dry-run
+
+        # RARE: Private user-scoped data (requires --make-private)
+        rem process ingest private-notes.md --make-private --user-id user-123
     """
     import asyncio
+
+    # Validate: user_id requires --make-private flag
+    if user_id and not make_private:
+        raise click.UsageError(
+            "Setting --user-id requires the --make-private flag.\n\n"
+            "Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
+            "is rarely needed - only use --make-private for truly personal content.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # If --make-private is set, user_id is required
+    if make_private and not user_id:
+        raise click.UsageError(
+            "--make-private requires --user-id to specify which user owns the data.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # Clear user_id if not making private (ensure None for public data)
+    effective_user_id = user_id if make_private else None
+    from pathlib import Path
     from ...services.content import ContentService
 
     async def _ingest():
-        # Initialize ContentService with repositories for proper resource saving
         from rem.services.postgres import get_postgres_service
         from rem.services.postgres.repository import Repository
-        from rem.models.entities import File, Resource
+        from rem.models.entities import File, Resource, Ontology
+
+        input_path = Path(path)
+        tag_list = tags.split(",") if tags else None
+
+        # Collect files to process
+        if input_path.is_dir():
+            files_to_process = list(input_path.glob(pattern))
+            if not files_to_process:
+                logger.error(f"No files matching '{pattern}' found in {input_path}")
+                sys.exit(1)
+            logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
+        else:
+            files_to_process = [input_path]
+
+        # Dry run: just show what would be processed
+        if dry_run:
+            logger.info("DRY RUN - Would ingest:")
+            for f in files_to_process[:20]:
+                entity_key = f.stem  # filename without extension
+                logger.info(f"  {f} → {table or 'auto-detect'} (key: {entity_key})")
+            if len(files_to_process) > 20:
+                logger.info(f"  ... and {len(files_to_process) - 20} more files")
+            return
 
         db = get_postgres_service()
         if not db:
@@ -51,53 +114,118 @@ def process_ingest(
         await db.connect()
 
         try:
-            … (10 lines not shown in the package diff)
-                user_id=user_id,
-                category=category,
-                tags=tag_list,
-                is_local_server=True,  # CLI is local
-            )
-
-            # Handle schema ingestion (agents/evaluators)
-            if result.get("schema_name"):
-                logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
-                logger.info(f"Version: {result.get('version', '1.0.0')}")
-            # Handle file ingestion
-            elif result.get("processing_status") == "completed":
-                logger.success(f"File ingested: {result['file_name']}")
-                logger.info(f"File ID: {result['file_id']}")
-                logger.info(f"Resources created: {result['resources_created']}")
+            # Direct table ingestion (ontologies, etc.)
+            if table:
+                await _ingest_to_table(
+                    db=db,
+                    files=files_to_process,
+                    table_name=table,
+                    user_id=effective_user_id,
+                    category=category,
+                    tag_list=tag_list,
+                )
             else:
-                … (2 lines not shown in the package diff)
+                # Standard file ingestion via ContentService
+                file_repo = Repository(File, "files", db=db)
+                resource_repo = Repository(Resource, "resources", db=db)
+                service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
+
+                for file_path in files_to_process:
+                    scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
+                    logger.info(f"Ingesting: {file_path} ({scope_msg})")
+
+                    result = await service.ingest_file(
+                        file_uri=str(file_path),
+                        user_id=effective_user_id,
+                        category=category,
+                        tags=tag_list,
+                        is_local_server=True,
+                    )
+
+                    # Handle schema ingestion (agents/evaluators)
+                    if result.get("schema_name"):
+                        logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
+                    elif result.get("processing_status") == "completed":
+                        logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
+                    else:
+                        logger.error(f"Failed: {result.get('message', 'Unknown error')}")
 
         except Exception as e:
             logger.error(f"Error during ingestion: {e}")
             sys.exit(1)
         finally:
-            # Wait for …
+            # Wait for embedding worker to finish
            from rem.services.embeddings.worker import get_global_embedding_worker
            try:
                worker = get_global_embedding_worker()
                if worker and worker.running and not worker.task_queue.empty():
-                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks …
-                    # Worker.stop() waits for queue to drain (see worker.py line ~148)
+                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
                    await worker.stop()
            except RuntimeError:
-                # Worker doesn't exist yet - no tasks queued
                pass
 
            await db.disconnect()
 
+    async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
+        """Direct ingestion of files to a specific table (ontologies, etc.)."""
+        from rem.services.postgres.repository import Repository
+        from rem import get_model_registry
+        from rem.utils.model_helpers import get_table_name
+
+        # Get model class for table
+        registry = get_model_registry()
+        registry.register_core_models()
+        model_class = None
+        for model in registry.get_model_classes().values():
+            if get_table_name(model) == table_name:
+                model_class = model
+                break
+
+        if not model_class:
+            logger.error(f"Unknown table: {table_name}")
+            sys.exit(1)
+
+        repo = Repository(model_class, table_name, db=db)
+        processed = 0
+        failed = 0
+
+        for file_path in files:
+            try:
+                # Read file content
+                content = file_path.read_text(encoding="utf-8")
+                entity_key = file_path.stem  # filename without extension
+
+                # Build entity based on table
+                entity_data = {
+                    "name": entity_key,
+                    "content": content,
+                    "tags": tag_list or [],
+                }
+
+                # Add optional fields
+                if category:
+                    entity_data["category"] = category
+
+                # Scoping: user_id for private data, "public" for shared
+                # tenant_id="public" is the default for shared knowledge bases
+                entity_data["tenant_id"] = user_id or "public"
+                entity_data["user_id"] = user_id  # None = public/shared
+
+                # For ontologies, add URI
+                if table_name == "ontologies":
+                    entity_data["uri"] = f"file://{file_path.absolute()}"
+
+                entity = model_class(**entity_data)
+                await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
+                processed += 1
+                logger.success(f"  ✓ {entity_key}")
+
+            except Exception as e:
+                failed += 1
+                logger.error(f"  ✗ {file_path.name}: {e}")
+
+        logger.info(f"Completed: {processed} succeeded, {failed} failed")
+
     asyncio.run(_ingest())
 
 def register_commands(group: click.Group):
````