remdb 0.3.180__py3-none-any.whl → 0.3.230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. rem/agentic/README.md +36 -2
  2. rem/agentic/context.py +173 -0
  3. rem/agentic/context_builder.py +12 -2
  4. rem/agentic/mcp/tool_wrapper.py +2 -2
  5. rem/agentic/providers/pydantic_ai.py +1 -1
  6. rem/agentic/schema.py +2 -2
  7. rem/api/main.py +1 -1
  8. rem/api/mcp_router/server.py +4 -0
  9. rem/api/mcp_router/tools.py +542 -166
  10. rem/api/routers/admin.py +30 -4
  11. rem/api/routers/auth.py +106 -10
  12. rem/api/routers/chat/child_streaming.py +379 -0
  13. rem/api/routers/chat/completions.py +74 -37
  14. rem/api/routers/chat/sse_events.py +7 -3
  15. rem/api/routers/chat/streaming.py +352 -257
  16. rem/api/routers/chat/streaming_utils.py +327 -0
  17. rem/api/routers/common.py +18 -0
  18. rem/api/routers/dev.py +7 -1
  19. rem/api/routers/feedback.py +9 -1
  20. rem/api/routers/messages.py +176 -38
  21. rem/api/routers/models.py +9 -1
  22. rem/api/routers/query.py +12 -1
  23. rem/api/routers/shared_sessions.py +16 -0
  24. rem/auth/jwt.py +19 -4
  25. rem/auth/middleware.py +42 -28
  26. rem/cli/README.md +62 -0
  27. rem/cli/commands/ask.py +61 -81
  28. rem/cli/commands/db.py +55 -31
  29. rem/cli/commands/process.py +171 -43
  30. rem/models/entities/ontology.py +18 -20
  31. rem/schemas/agents/rem.yaml +1 -1
  32. rem/services/content/service.py +18 -5
  33. rem/services/embeddings/worker.py +26 -12
  34. rem/services/postgres/__init__.py +28 -3
  35. rem/services/postgres/diff_service.py +57 -5
  36. rem/services/postgres/programmable_diff_service.py +635 -0
  37. rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
  38. rem/services/postgres/register_type.py +11 -10
  39. rem/services/postgres/repository.py +39 -29
  40. rem/services/postgres/schema_generator.py +5 -5
  41. rem/services/postgres/sql_builder.py +6 -5
  42. rem/services/session/__init__.py +8 -1
  43. rem/services/session/compression.py +40 -2
  44. rem/services/session/pydantic_messages.py +292 -0
  45. rem/settings.py +28 -0
  46. rem/sql/migrations/001_install.sql +125 -7
  47. rem/sql/migrations/002_install_models.sql +159 -149
  48. rem/sql/migrations/004_cache_system.sql +7 -275
  49. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  50. rem/utils/schema_loader.py +79 -51
  51. {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/METADATA +2 -2
  52. {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/RECORD +54 -48
  53. {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/WHEEL +0 -0
  54. {remdb-0.3.180.dist-info → remdb-0.3.230.dist-info}/entry_points.txt +0 -0
rem/cli/README.md CHANGED
@@ -434,6 +434,68 @@ Ensure you're using the correct model format:
  - OpenAI: `openai:gpt-4o-mini`, `openai:gpt-4o`
  - Anthropic: `anthropic:claude-sonnet-4-5-20250929`

+ ## Data Visibility: PUBLIC vs PRIVATE
+
+ **IMPORTANT: All ingested data is PUBLIC by default.** This is the correct behavior
+ for shared knowledge bases (ontologies, procedures, reference data).
+
+ ### Why PUBLIC by Default?
+
+ Most data in REM should be searchable by all users:
+ - Clinical ontologies (disorders, symptoms, drugs)
+ - Procedures and protocols (SCID-5, PHQ-9, etc.)
+ - Reference documentation
+ - Shared domain knowledge
+
+ The `rem_lookup()` function searches for data where `user_id IS NULL`, which means
+ public data. If you set `user_id` on data, it becomes invisible to other users.
+
+ ### Ingesting Public Data (Default)
+
+ ```bash
+ # Standard ingestion - data is PUBLIC
+ rem process ingest ontology/procedures/ --table ontologies
+
+ # From S3 - also PUBLIC
+ rem process ingest s3://bucket/docs/reference.pdf
+ ```
+
+ ### Ingesting Private Data (Rare)
+
+ Private data requires the explicit `--make-private` flag:
+
+ ```bash
+ # Private user data - requires --make-private and --user-id
+ rem process ingest personal-notes.md --make-private --user-id user-123
+ ```
+
+ **When to use private data:**
+ - User-uploaded personal documents
+ - Session-specific content
+ - User notes and annotations
+
+ **NEVER use private data for:**
+ - Ontologies and reference material
+ - Clinical procedures and protocols
+ - Shared knowledge bases
+ - Anything that should be searchable by agents
+
+ ### Common Mistake
+
+ If agents can't find data via `search_rem`, the most common cause is that the data
+ was ingested with a `user_id` set. Check with:
+
+ ```sql
+ SELECT name, user_id FROM ontologies WHERE name = 'phq-9-procedure';
+ -- user_id should be NULL for public data
+ ```
+
+ Fix by setting user_id to NULL:
+ ```sql
+ UPDATE ontologies SET user_id = NULL WHERE user_id IS NOT NULL;
+ UPDATE kv_store SET user_id = NULL WHERE entity_type = 'ontologies' AND user_id IS NOT NULL;
+ ```
+
  ## Next Steps

  1. **Implement Schema Registry**
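
> Note: the visibility rule documented above reduces to a single SQL predicate. A minimal sketch of the same check from Python, assuming an asyncpg connection and the `ontologies` table shown in the README; `lookup_public` and the DSN are illustrative names, not part of REM's API.

```python
# Illustrative only: mirrors the `user_id IS NULL` = public convention
# described in the README section above.
import asyncio
import asyncpg

async def lookup_public(name: str) -> list[asyncpg.Record]:
    conn = await asyncpg.connect("postgresql://localhost/rem")  # hypothetical DSN
    try:
        # Only rows with no owner are visible to everyone; setting user_id
        # hides a row from all other users.
        return await conn.fetch(
            "SELECT name, user_id FROM ontologies WHERE name = $1 AND user_id IS NULL",
            name,
        )
    finally:
        await conn.close()

if __name__ == "__main__":
    print(asyncio.run(lookup_public("phq-9-procedure")))
```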
rem/cli/commands/ask.py CHANGED
@@ -71,16 +71,18 @@ async def run_agent_streaming(
      max_turns: int = 10,
      context: AgentContext | None = None,
      max_iterations: int | None = None,
+     user_message: str | None = None,
  ) -> None:
      """
-     Run agent in streaming mode using agent.iter() with usage limits.
+     Run agent in streaming mode using the SAME code path as the API.

-     Design Pattern:
-     - Use agent.iter() for complete execution with tool call visibility
-     - run_stream() stops after first output, missing tool calls
-     - Stream tool call markers: [Calling: tool_name]
-     - Stream text content deltas as they arrive
-     - Show final structured result
+     This uses stream_openai_response_with_save from the API to ensure:
+     1. Tool calls are saved as separate "tool" messages (not embedded in content)
+     2. Assistant response is clean text only (no [Calling: ...] markers)
+     3. CLI testing is equivalent to API testing
+
+     The CLI displays tool calls as [Calling: tool_name] for visibility,
+     but these are NOT saved to the database.

      Args:
          agent: Pydantic AI agent
@@ -88,88 +90,66 @@ async def run_agent_streaming(
          max_turns: Maximum turns for agent execution (not used in current API)
          context: Optional AgentContext for session persistence
          max_iterations: Maximum iterations/requests (from agent schema or settings)
+         user_message: The user's original message (for database storage)
      """
-     from pydantic_ai import UsageLimits
-     from rem.utils.date_utils import to_iso_with_z, utc_now
+     import json
+     from rem.api.routers.chat.streaming import stream_openai_response_with_save, save_user_message

      logger.info("Running agent in streaming mode...")

      try:
-         # Import event types for streaming
-         from pydantic_ai import Agent as PydanticAgent
-         from pydantic_ai.messages import PartStartEvent, PartDeltaEvent, TextPartDelta, ToolCallPart
-
-         # Accumulate assistant response for session persistence
-         assistant_response_parts = []
-
-         # Use agent.iter() to get complete execution with tool calls
-         usage_limits = UsageLimits(request_limit=max_iterations) if max_iterations else None
-         async with agent.iter(prompt, usage_limits=usage_limits) as agent_run:
-             async for node in agent_run:
-                 # Check if this is a model request node (includes tool calls and text)
-                 if PydanticAgent.is_model_request_node(node):
-                     # Stream events from model request
-                     request_stream: Any
-                     async with node.stream(agent_run.ctx) as request_stream:
-                         async for event in request_stream:
-                             # Tool call start event
-                             if isinstance(event, PartStartEvent) and isinstance(
-                                 event.part, ToolCallPart
-                             ):
-                                 tool_marker = f"\n[Calling: {event.part.tool_name}]"
-                                 print(tool_marker, flush=True)
-                                 assistant_response_parts.append(tool_marker)
-
-                             # Text content delta
-                             elif isinstance(event, PartDeltaEvent) and isinstance(
-                                 event.delta, TextPartDelta
-                             ):
-                                 print(event.delta.content_delta, end="", flush=True)
-                                 assistant_response_parts.append(event.delta.content_delta)
-
-         print("\n") # Final newline after streaming
-
-         # Get final result from agent_run
-         result = agent_run.result
-         if hasattr(result, "output"):
-             logger.info("Final structured result:")
-             output = result.output
-             from rem.agentic.serialization import serialize_agent_result
-             output_json = json.dumps(serialize_agent_result(output), indent=2)
-             print(output_json)
-             assistant_response_parts.append(f"\n{output_json}")
-
-         # Save session messages (if session_id provided and postgres enabled)
-         if context and context.session_id and settings.postgres.enabled:
-             from ...services.session.compression import SessionMessageStore
-
-             # Extract just the user query from prompt
-             # Prompt format from ContextBuilder: system + history + user message
-             # We need to extract the last user message
-             user_message_content = prompt.split("\n\n")[-1] if "\n\n" in prompt else prompt
-
-             user_message = {
-                 "role": "user",
-                 "content": user_message_content,
-                 "timestamp": to_iso_with_z(utc_now()),
-             }
-
-             assistant_message = {
-                 "role": "assistant",
-                 "content": "".join(assistant_response_parts),
-                 "timestamp": to_iso_with_z(utc_now()),
-             }
-
-             # Store messages with compression
-             store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
-             await store.store_session_messages(
+         # Save user message BEFORE streaming (same as API, using shared utility)
+         if context and context.session_id and user_message:
+             await save_user_message(
                  session_id=context.session_id,
-                 messages=[user_message, assistant_message],
                  user_id=context.user_id,
-                 compress=True,
+                 content=user_message,
              )

-         logger.debug(f"Saved conversation to session {context.session_id}")
+         # Use the API streaming code path for consistency
+         # This properly handles tool calls and message persistence
+         model_name = getattr(agent, 'model', 'unknown')
+         if hasattr(model_name, 'model_name'):
+             model_name = model_name.model_name
+         elif hasattr(model_name, 'name'):
+             model_name = model_name.name
+         else:
+             model_name = str(model_name)
+
+         async for chunk in stream_openai_response_with_save(
+             agent=agent.agent if hasattr(agent, 'agent') else agent,
+             prompt=prompt,
+             model=model_name,
+             session_id=context.session_id if context else None,
+             user_id=context.user_id if context else None,
+             agent_context=context,
+         ):
+             # Parse SSE chunks for CLI display
+             if chunk.startswith("event: tool_call"):
+                 # Extract tool call info from next data line
+                 continue
+             elif chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
+                 try:
+                     data_str = chunk[6:].strip()
+                     if data_str:
+                         data = json.loads(data_str)
+                         # Check for tool_call event
+                         if data.get("type") == "tool_call":
+                             tool_name = data.get("tool_name", "tool")
+                             status = data.get("status", "")
+                             if status == "started":
+                                 print(f"\n[Calling: {tool_name}]", flush=True)
+                         # Check for text content (OpenAI format)
+                         elif "choices" in data and data["choices"]:
+                             delta = data["choices"][0].get("delta", {})
+                             content = delta.get("content")
+                             if content:
+                                 print(content, end="", flush=True)
+                 except (json.JSONDecodeError, KeyError, IndexError):
+                     pass
+
+         print("\n") # Final newline after streaming
+         logger.info("Final structured result:")

      except Exception as e:
          logger.error(f"Agent execution failed: {e}")
@@ -549,7 +529,7 @@ async def _ask_async(

      # Run agent with session persistence
      if stream:
-         await run_agent_streaming(agent, prompt, max_turns=max_turns, context=context)
+         await run_agent_streaming(agent, prompt, max_turns=max_turns, context=context, user_message=query)
      else:
          await run_agent_non_streaming(
              agent,
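
> Note: the rewritten CLI loop above consumes the same SSE chunks the API emits. A standalone sketch of that parsing with a hand-made chunk; the exact payloads are REM-internal, and only the `data: ...` / `data: [DONE]` framing and OpenAI-style `choices[0].delta.content` fields are assumed here.

```python
# Standalone sketch of the SSE parsing performed in the new CLI loop.
import json

def handle_chunk(chunk: str) -> None:
    # Skip non-data frames and the terminal [DONE] sentinel.
    if not chunk.startswith("data: ") or chunk.startswith("data: [DONE]"):
        return
    try:
        data = json.loads(chunk[6:].strip())
    except json.JSONDecodeError:
        return
    # Tool-call progress events carry a custom type/status pair.
    if data.get("type") == "tool_call" and data.get("status") == "started":
        print(f"\n[Calling: {data.get('tool_name', 'tool')}]", flush=True)
    # Text deltas arrive in OpenAI chat-completion chunk format.
    elif data.get("choices"):
        content = data["choices"][0].get("delta", {}).get("content")
        if content:
            print(content, end="", flush=True)

handle_chunk('data: {"choices": [{"delta": {"content": "Hello"}}]}')
```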
rem/cli/commands/db.py CHANGED
@@ -375,8 +375,10 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
      import polars as pl
      import yaml
      from ...models.core.inline_edge import InlineEdge
-     from ...models.entities import Resource, Moment, User, Message, SharedSession, Schema
+     from ...models.entities import SharedSession
      from ...services.postgres import get_postgres_service
+     from ...utils.model_helpers import get_table_name
+     from ... import get_model_registry

      logger.info(f"Loading data from: {file_path}")
      scope_msg = f"user: {user_id}" if user_id else "public"
@@ -385,13 +387,12 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
      suffix = file_path.suffix.lower()
      is_yaml = suffix in {".yaml", ".yml"}

-     # Map table names to model classes
+     # Build MODEL_MAP dynamically from registry
+     registry = get_model_registry()
+     registry.register_core_models()
      MODEL_MAP = {
-         "users": User,
-         "moments": Moment,
-         "resources": Resource,
-         "messages": Message,
-         "schemas": Schema,
+         get_table_name(model): model
+         for model in registry.get_model_classes().values()
      }

      # Non-CoreModel tables that need direct SQL insertion
@@ -432,12 +433,9 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
      logger.info(f"Columns: {list(df.columns)}")

      # Validate first row against model if table is known
-     if table in {"users", "moments", "resources", "messages", "schemas"} and rows:
-         from ...models.entities import Resource, Moment, User, Message, Schema
+     if table in MODEL_MAP and rows:
          from ...utils.model_helpers import validate_data_for_model
-         model_map = {"users": User, "moments": Moment, "resources": Resource,
-                      "messages": Message, "schemas": Schema}
-         result = validate_data_for_model(model_map[table], rows[0])
+         result = validate_data_for_model(MODEL_MAP[table], rows[0])
          if result.extra_fields:
              logger.warning(f"Unknown fields (ignored): {result.extra_fields}")
          if result.valid:
@@ -457,6 +455,10 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d

      await pg.connect()

+     # Start embedding worker for generating embeddings
+     if pg.embedding_worker:
+         await pg.embedding_worker.start()
+
      try:
          total_loaded = 0

@@ -467,8 +469,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
             # Handle direct insert tables (non-CoreModel)
             if table_name in DIRECT_INSERT_TABLES:
                 for row_data in rows:
-                     if "tenant_id" not in row_data:
-                         row_data["tenant_id"] = "default"
+                     # tenant_id is optional - NULL means public/shared

                     if table_name == "shared_sessions":
                         await pg.fetch(
@@ -479,7 +480,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
                             row_data["session_id"],
                             row_data["owner_user_id"],
                             row_data["shared_with_user_id"],
-                             row_data["tenant_id"],
+                             row_data.get("tenant_id"), # Optional - NULL means public
                         )
                         total_loaded += 1
                         logger.success(f"Loaded shared_session: {row_data['owner_user_id']} -> {row_data['shared_with_user_id']}")
@@ -492,10 +493,8 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
             model_class = MODEL_MAP[table_name]

             for row_idx, row_data in enumerate(rows):
-                 if "user_id" not in row_data and user_id is not None:
-                     row_data["user_id"] = user_id
-                 if "tenant_id" not in row_data and user_id is not None:
-                     row_data["tenant_id"] = row_data.get("user_id", user_id)
+                 # tenant_id and user_id are optional - NULL means public/shared data
+                 # Data files can explicitly set tenant_id/user_id if needed

                 # Convert graph_edges to InlineEdge format if present
                 if "graph_edges" in row_data:
@@ -530,6 +529,14 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d

          logger.success(f"Data loaded successfully! Total rows: {total_loaded}")

+         # Wait for embeddings to complete
+         if pg.embedding_worker and pg.embedding_worker.running:
+             queue_size = pg.embedding_worker.task_queue.qsize()
+             if queue_size > 0:
+                 logger.info(f"Waiting for {queue_size} embeddings to complete...")
+                 await pg.embedding_worker.stop()
+                 logger.success("Embeddings generated successfully")
+
      finally:
          await pg.disconnect()

@@ -634,7 +641,7 @@ async def _diff_async(

      if not result.has_changes:
          click.secho("✓ No schema drift detected", fg="green")
-         click.echo(" Database matches Pydantic models")
+         click.echo(" Database matches source (tables, functions, triggers, views)")
          if result.filtered_count > 0:
              click.echo()
              click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
@@ -646,17 +653,34 @@ async def _diff_async(
      if result.filtered_count > 0:
          click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
      click.echo()
-     click.echo("Changes:")
-     for line in result.summary:
-         if line.startswith("+"):
-             click.secho(f" {line}", fg="green")
-         elif line.startswith("-"):
-             click.secho(f" {line}", fg="red")
-         elif line.startswith("~"):
-             click.secho(f" {line}", fg="yellow")
-         else:
-             click.echo(f" {line}")
-     click.echo()
+
+     # Table/column changes (Alembic)
+     if result.summary:
+         click.echo("Table Changes:")
+         for line in result.summary:
+             if line.startswith("+"):
+                 click.secho(f" {line}", fg="green")
+             elif line.startswith("-"):
+                 click.secho(f" {line}", fg="red")
+             elif line.startswith("~"):
+                 click.secho(f" {line}", fg="yellow")
+             else:
+                 click.echo(f" {line}")
+         click.echo()
+
+     # Programmable object changes (functions, triggers, views)
+     if result.programmable_summary:
+         click.echo("Programmable Objects (functions/triggers/views):")
+         for line in result.programmable_summary:
+             if line.startswith("+"):
+                 click.secho(f" {line}", fg="green")
+             elif line.startswith("-"):
+                 click.secho(f" {line}", fg="red")
+             elif line.startswith("~"):
+                 click.secho(f" {line}", fg="yellow")
+             else:
+                 click.echo(f" {line}")
+         click.echo()

      # Generate migration if requested
      if generate:
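
> Note: the `db load` change above replaces a hard-coded table→model dict with one derived from the model registry. A toy version of the same pattern; the registry contents and the `get_table_name` rule here are stand-ins, not REM's real `get_model_registry()` / `get_table_name()`.

```python
# Toy version of the registry-driven MODEL_MAP shown in the diff above.
from pydantic import BaseModel

class User(BaseModel):
    id: str

class Moment(BaseModel):
    id: str

REGISTRY = {"User": User, "Moment": Moment}  # hypothetical registry contents

def get_table_name(model: type[BaseModel]) -> str:
    # Assumption: table name is the lowercased plural of the class name.
    return model.__name__.lower() + "s"

# New models registered later are picked up automatically, unlike a
# hand-maintained dict.
MODEL_MAP = {get_table_name(m): m for m in REGISTRY.values()}
assert MODEL_MAP == {"users": User, "moments": Moment}
```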
rem/cli/commands/process.py CHANGED
@@ -11,39 +11,102 @@ from rem.services.content import ContentService


  @click.command(name="ingest")
- @click.argument("file_path", type=click.Path(exists=True))
- @click.option("--user-id", default=None, help="User ID to scope file privately (default: public/shared)")
+ @click.argument("path", type=click.Path(exists=True))
+ @click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
+ @click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
+ @click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
  @click.option("--category", help="Optional file category")
  @click.option("--tags", help="Optional comma-separated tags")
+ @click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
+ @click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
  def process_ingest(
-     file_path: str,
+     path: str,
+     table: str | None,
+     make_private: bool,
      user_id: str | None,
      category: str | None,
      tags: str | None,
+     pattern: str,
+     dry_run: bool,
  ):
      """
-     Ingest a file into REM (storage + parsing + embedding).
+     Ingest files into REM (storage + parsing + embedding).

-     This command performs the full ingestion pipeline:
-     1. Reads the file from the local path.
-     2. Stores it in the configured storage (local/S3).
-     3. Parses the content.
-     4. Chunks and embeds the content into Resources.
-     5. Creates a File entity record.
+     Supports both single files and directories. For directories, recursively
+     processes files matching the pattern (default: **/*.md).
+
+     **IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
+     shared knowledge bases (ontologies, procedures, reference data). Private
+     user-scoped data is rarely needed and requires the explicit --make-private flag.
+
+     Target table is auto-detected for schemas (agent.yaml → schemas table).
+     Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).

      Examples:
          rem process ingest sample.pdf
          rem process ingest contract.docx --category legal --tags contract,2023
          rem process ingest agent.yaml # Auto-detects kind=agent, saves to schemas table
+
+         # Directory ingestion into ontologies table (PUBLIC - no user-id needed)
+         rem process ingest ontology/procedures/scid-5/ --table ontologies
+         rem process ingest ontology/ --table ontologies --pattern "**/*.md"
+
+         # Preview what would be ingested
+         rem process ingest ontology/ --table ontologies --dry-run
+
+         # RARE: Private user-scoped data (requires --make-private)
+         rem process ingest private-notes.md --make-private --user-id user-123
      """
      import asyncio
+
+     # Validate: user_id requires --make-private flag
+     if user_id and not make_private:
+         raise click.UsageError(
+             "Setting --user-id requires the --make-private flag.\n\n"
+             "Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
+             "is rarely needed - only use --make-private for truly personal content.\n\n"
+             "Example: rem process ingest file.md --make-private --user-id user-123"
+         )
+
+     # If --make-private is set, user_id is required
+     if make_private and not user_id:
+         raise click.UsageError(
+             "--make-private requires --user-id to specify which user owns the data.\n\n"
+             "Example: rem process ingest file.md --make-private --user-id user-123"
+         )
+
+     # Clear user_id if not making private (ensure None for public data)
+     effective_user_id = user_id if make_private else None
+     from pathlib import Path
      from ...services.content import ContentService

      async def _ingest():
-         # Initialize ContentService with repositories for proper resource saving
          from rem.services.postgres import get_postgres_service
          from rem.services.postgres.repository import Repository
-         from rem.models.entities import File, Resource
+         from rem.models.entities import File, Resource, Ontology
+
+         input_path = Path(path)
+         tag_list = tags.split(",") if tags else None
+
+         # Collect files to process
+         if input_path.is_dir():
+             files_to_process = list(input_path.glob(pattern))
+             if not files_to_process:
+                 logger.error(f"No files matching '{pattern}' found in {input_path}")
+                 sys.exit(1)
+             logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
+         else:
+             files_to_process = [input_path]
+
+         # Dry run: just show what would be processed
+         if dry_run:
+             logger.info("DRY RUN - Would ingest:")
+             for f in files_to_process[:20]:
+                 entity_key = f.stem # filename without extension
+                 logger.info(f" {f} → {table or 'auto-detect'} (key: {entity_key})")
+             if len(files_to_process) > 20:
+                 logger.info(f" ... and {len(files_to_process) - 20} more files")
+             return

          db = get_postgres_service()
          if not db:
@@ -51,53 +114,118 @@ def process_ingest(
          await db.connect()

          try:
-             file_repo = Repository(File, "files", db=db)
-             resource_repo = Repository(Resource, "resources", db=db)
-             service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
-
-             tag_list = tags.split(",") if tags else None
-
-             scope_msg = f"user: {user_id}" if user_id else "public"
-             logger.info(f"Ingesting file: {file_path} ({scope_msg})")
-             result = await service.ingest_file(
-                 file_uri=file_path,
-                 user_id=user_id,
-                 category=category,
-                 tags=tag_list,
-                 is_local_server=True, # CLI is local
-             )
-
-             # Handle schema ingestion (agents/evaluators)
-             if result.get("schema_name"):
-                 logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
-                 logger.info(f"Version: {result.get('version', '1.0.0')}")
-             # Handle file ingestion
-             elif result.get("processing_status") == "completed":
-                 logger.success(f"File ingested: {result['file_name']}")
-                 logger.info(f"File ID: {result['file_id']}")
-                 logger.info(f"Resources created: {result['resources_created']}")
+             # Direct table ingestion (ontologies, etc.)
+             if table:
+                 await _ingest_to_table(
+                     db=db,
+                     files=files_to_process,
+                     table_name=table,
+                     user_id=effective_user_id,
+                     category=category,
+                     tag_list=tag_list,
+                 )
              else:
-                 logger.error(f"Ingestion failed: {result.get('message', 'Unknown error')}")
-                 sys.exit(1)
+                 # Standard file ingestion via ContentService
+                 file_repo = Repository(File, "files", db=db)
+                 resource_repo = Repository(Resource, "resources", db=db)
+                 service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
+
+                 for file_path in files_to_process:
+                     scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
+                     logger.info(f"Ingesting: {file_path} ({scope_msg})")
+
+                     result = await service.ingest_file(
+                         file_uri=str(file_path),
+                         user_id=effective_user_id,
+                         category=category,
+                         tags=tag_list,
+                         is_local_server=True,
+                     )
+
+                     # Handle schema ingestion (agents/evaluators)
+                     if result.get("schema_name"):
+                         logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
+                     elif result.get("processing_status") == "completed":
+                         logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
+                     else:
+                         logger.error(f"Failed: {result.get('message', 'Unknown error')}")

          except Exception as e:
              logger.error(f"Error during ingestion: {e}")
              sys.exit(1)
          finally:
-             # Wait for global embedding worker to finish queued tasks
+             # Wait for embedding worker to finish
              from rem.services.embeddings.worker import get_global_embedding_worker
              try:
                  worker = get_global_embedding_worker()
                  if worker and worker.running and not worker.task_queue.empty():
-                     logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks to complete...")
-                     # Worker.stop() waits for queue to drain (see worker.py line ~148)
+                     logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
                      await worker.stop()
              except RuntimeError:
-                 # Worker doesn't exist yet - no tasks queued
                  pass

              await db.disconnect()

+     async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
+         """Direct ingestion of files to a specific table (ontologies, etc.)."""
+         from rem.services.postgres.repository import Repository
+         from rem import get_model_registry
+         from rem.utils.model_helpers import get_table_name
+
+         # Get model class for table
+         registry = get_model_registry()
+         registry.register_core_models()
+         model_class = None
+         for model in registry.get_model_classes().values():
+             if get_table_name(model) == table_name:
+                 model_class = model
+                 break
+
+         if not model_class:
+             logger.error(f"Unknown table: {table_name}")
+             sys.exit(1)
+
+         repo = Repository(model_class, table_name, db=db)
+         processed = 0
+         failed = 0
+
+         for file_path in files:
+             try:
+                 # Read file content
+                 content = file_path.read_text(encoding="utf-8")
+                 entity_key = file_path.stem # filename without extension
+
+                 # Build entity based on table
+                 entity_data = {
+                     "name": entity_key,
+                     "content": content,
+                     "tags": tag_list or [],
+                 }
+
+                 # Add optional fields
+                 if category:
+                     entity_data["category"] = category
+
+                 # Scoping: user_id for private data, "public" for shared
+                 # tenant_id="public" is the default for shared knowledge bases
+                 entity_data["tenant_id"] = user_id or "public"
+                 entity_data["user_id"] = user_id # None = public/shared
+
+                 # For ontologies, add URI
+                 if table_name == "ontologies":
+                     entity_data["uri"] = f"file://{file_path.absolute()}"
+
+                 entity = model_class(**entity_data)
+                 await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
+                 processed += 1
+                 logger.success(f" ✓ {entity_key}")
+
+             except Exception as e:
+                 failed += 1
+                 logger.error(f" ✗ {file_path.name}: {e}")
+
+         logger.info(f"Completed: {processed} succeeded, {failed} failed")
+
      asyncio.run(_ingest())

  def register_commands(group: click.Group):
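
> Note: the paired checks in `process_ingest` enforce that `--user-id` and `--make-private` travel together, so data can only become private deliberately. The same pattern reduced to a self-contained click command; the `ingest` command name here is hypothetical, for illustration only.

```python
# Minimal sketch of the paired-flag validation added to process_ingest.
import click

@click.command()
@click.option("--make-private", is_flag=True, help="Scope data to a single user.")
@click.option("--user-id", default=None, help="Owner of private data.")
def ingest(make_private: bool, user_id: str | None) -> None:
    # --user-id without --make-private would silently hide data; reject it.
    if user_id and not make_private:
        raise click.UsageError("Setting --user-id requires the --make-private flag.")
    # --make-private without an owner is meaningless; reject it too.
    if make_private and not user_id:
        raise click.UsageError("--make-private requires --user-id.")
    # None means public/shared, matching the visibility rules documented above.
    effective_user_id = user_id if make_private else None
    click.echo(f"scope: {effective_user_id or 'public'}")

if __name__ == "__main__":
    ingest()
```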