remdb 0.3.181__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of remdb might be problematic.
- rem/agentic/context.py +101 -0
- rem/agentic/context_builder.py +12 -2
- rem/api/main.py +1 -1
- rem/api/mcp_router/server.py +4 -0
- rem/api/mcp_router/tools.py +395 -159
- rem/api/routers/chat/completions.py +51 -9
- rem/api/routers/chat/sse_events.py +2 -2
- rem/api/routers/chat/streaming.py +146 -21
- rem/api/routers/messages.py +96 -23
- rem/auth/middleware.py +42 -28
- rem/cli/README.md +62 -0
- rem/cli/commands/db.py +33 -19
- rem/cli/commands/process.py +171 -43
- rem/services/content/service.py +18 -5
- rem/services/postgres/__init__.py +28 -3
- rem/services/postgres/diff_service.py +57 -5
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
- rem/services/postgres/register_type.py +11 -10
- rem/services/session/__init__.py +7 -1
- rem/services/session/pydantic_messages.py +210 -0
- rem/sql/migrations/001_install.sql +115 -7
- rem/sql/migrations/002_install_models.sql +117 -105
- rem/sql/migrations/004_cache_system.sql +7 -275
- rem/utils/schema_loader.py +6 -6
- {remdb-0.3.181.dist-info → remdb-0.3.200.dist-info}/METADATA +1 -1
- {remdb-0.3.181.dist-info → remdb-0.3.200.dist-info}/RECORD +29 -27
- {remdb-0.3.181.dist-info → remdb-0.3.200.dist-info}/WHEEL +0 -0
- {remdb-0.3.181.dist-info → remdb-0.3.200.dist-info}/entry_points.txt +0 -0
rem/auth/middleware.py
CHANGED
```diff
@@ -14,15 +14,14 @@ Design Pattern:
 - MCP paths always require authentication (protected service)
 
 Authentication Flow:
-1.
-2.
-3.
-4.
-5. If allow_anonymous=
-6. If allow_anonymous=False: Return 401 / redirect to login
+1. Check JWT/dev token/session for user identity first
+2. If user is admin: bypass API key check (admin privilege)
+3. If API key enabled and user is not admin: Validate X-API-Key header
+4. If allow_anonymous=True: Allow as anonymous (rate-limited)
+5. If allow_anonymous=False: Return 401 / redirect to login
 
 IMPORTANT: API key validates ACCESS, JWT identifies USER.
-
+Admin users bypass the API key requirement (trusted identity).
 
 Access Modes (configured in settings.auth):
 - enabled=true, allow_anonymous=true: Auth available, anonymous gets rate-limited access
@@ -195,6 +194,12 @@ class AuthMiddleware(BaseHTTPMiddleware):
 
         return None
 
+    def _is_admin(self, user: dict | None) -> bool:
+        """Check if user has admin role."""
+        if not user:
+            return False
+        return "admin" in user.get("roles", [])
+
     async def dispatch(self, request: Request, call_next):
         """
         Check authentication for protected paths.
@@ -219,8 +224,35 @@
         if not is_protected or is_excluded:
             return await call_next(request)
 
-        #
-        #
+        # Check for user identity FIRST (JWT, dev token, session)
+        # This allows admin users to bypass API key requirement
+        user = None
+
+        # Check for JWT token in Authorization header (primary user identity)
+        jwt_user = self._check_jwt_token(request)
+        if jwt_user:
+            user = jwt_user
+
+        # Check for dev token (non-production only)
+        if not user:
+            dev_user = self._check_dev_token(request)
+            if dev_user:
+                user = dev_user
+
+        # Check for valid session (backward compatibility)
+        if not user:
+            session_user = request.session.get("user")
+            if session_user:
+                user = session_user
+
+        # If user is admin, bypass API key check entirely
+        if self._is_admin(user):
+            logger.debug(f"Admin user {user.get('email')} bypassing API key check")
+            request.state.user = user
+            request.state.is_anonymous = False
+            return await call_next(request)
+
+        # API key validation for non-admin users (access control guardrail)
         if settings.api.api_key_enabled:
             api_key = request.headers.get("x-api-key")
             if not api_key:
@@ -238,27 +270,9 @@
                     headers={"WWW-Authenticate": 'ApiKey realm="REM API"'},
                 )
             logger.debug("X-API-Key validated for access")
-            # API key valid - continue to check JWT for user identity
-
-        # Check for JWT token in Authorization header (primary user identity)
-        jwt_user = self._check_jwt_token(request)
-        if jwt_user:
-            request.state.user = jwt_user
-            request.state.is_anonymous = False
-            return await call_next(request)
-
-        # Check for dev token (non-production only)
-        dev_user = self._check_dev_token(request)
-        if dev_user:
-            request.state.user = dev_user
-            request.state.is_anonymous = False
-            return await call_next(request)
-
-        # Check for valid session (backward compatibility)
-        user = request.session.get("user")
 
+        # If we have a valid user (non-admin, but passed API key check), allow access
         if user:
-            # Authenticated user - add to request state
             request.state.user = user
             request.state.is_anonymous = False
             return await call_next(request)
```

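Under the new order, user identity is resolved first and an admin identity alone clears the middleware, while non-admin callers still need `X-API-Key` whenever `api_key_enabled` is set. A minimal client-side sketch of the two request shapes; the base URL, endpoint path, and token values are placeholders, and only the header names come from the diff above.

```python
# Sketch of the two request shapes implied by the new auth flow.
# BASE_URL, the endpoint path, and token values are assumed placeholders;
# the Authorization and X-API-Key header names come from the middleware diff.
import httpx

BASE_URL = "http://localhost:8000"  # assumed local REM API

# Admin identity: the JWT alone is enough, the API key check is bypassed.
admin_resp = httpx.get(
    f"{BASE_URL}/api/messages",
    headers={"Authorization": "Bearer <admin-jwt>"},
)

# Non-admin identity: when API keys are enabled, X-API-Key validates access
# and the JWT still identifies the user.
member_resp = httpx.get(
    f"{BASE_URL}/api/messages",
    headers={
        "Authorization": "Bearer <user-jwt>",
        "X-API-Key": "<service-api-key>",
    },
)
```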
rem/cli/README.md
CHANGED
```diff
@@ -434,6 +434,68 @@ Ensure you're using the correct model format:
 - OpenAI: `openai:gpt-4o-mini`, `openai:gpt-4o`
 - Anthropic: `anthropic:claude-sonnet-4-5-20250929`
 
+## Data Visibility: PUBLIC vs PRIVATE
+
+**IMPORTANT: All ingested data is PUBLIC by default.** This is the correct behavior
+for shared knowledge bases (ontologies, procedures, reference data).
+
+### Why PUBLIC by Default?
+
+Most data in REM should be searchable by all users:
+- Clinical ontologies (disorders, symptoms, drugs)
+- Procedures and protocols (SCID-5, PHQ-9, etc.)
+- Reference documentation
+- Shared domain knowledge
+
+The `rem_lookup()` function searches for data where `user_id IS NULL`, which means
+public data. If you set `user_id` on data, it becomes invisible to other users.
+
+### Ingesting Public Data (Default)
+
+```bash
+# Standard ingestion - data is PUBLIC
+rem process ingest ontology/procedures/ --table ontologies
+
+# From S3 - also PUBLIC
+rem process ingest s3://bucket/docs/reference.pdf
+```
+
+### Ingesting Private Data (Rare)
+
+Private data requires the explicit `--make-private` flag:
+
+```bash
+# Private user data - requires --make-private and --user-id
+rem process ingest personal-notes.md --make-private --user-id user-123
+```
+
+**When to use private data:**
+- User-uploaded personal documents
+- Session-specific content
+- User notes and annotations
+
+**NEVER use private data for:**
+- Ontologies and reference material
+- Clinical procedures and protocols
+- Shared knowledge bases
+- Anything that should be searchable by agents
+
+### Common Mistake
+
+If agents can't find data via `search_rem`, the most common cause is that the data
+was ingested with a `user_id` set. Check with:
+
+```sql
+SELECT name, user_id FROM ontologies WHERE name = 'phq-9-procedure';
+-- user_id should be NULL for public data
+```
+
+Fix by setting user_id to NULL:
+```sql
+UPDATE ontologies SET user_id = NULL WHERE user_id IS NOT NULL;
+UPDATE kv_store SET user_id = NULL WHERE entity_type = 'ontologies' AND user_id IS NOT NULL;
+```
+
 ## Next Steps
 
 1. **Implement Schema Registry**
```

rem/cli/commands/db.py
CHANGED
```diff
@@ -469,8 +469,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
     # Handle direct insert tables (non-CoreModel)
     if table_name in DIRECT_INSERT_TABLES:
         for row_data in rows:
-
-            row_data["tenant_id"] = "default"
+            # tenant_id is optional - NULL means public/shared
 
             if table_name == "shared_sessions":
                 await pg.fetch(
@@ -481,7 +480,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
                     row_data["session_id"],
                     row_data["owner_user_id"],
                     row_data["shared_with_user_id"],
-                    row_data
+                    row_data.get("tenant_id"),  # Optional - NULL means public
                 )
                 total_loaded += 1
                 logger.success(f"Loaded shared_session: {row_data['owner_user_id']} -> {row_data['shared_with_user_id']}")
@@ -494,10 +493,8 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
     model_class = MODEL_MAP[table_name]
 
     for row_idx, row_data in enumerate(rows):
-        # user_id
-        #
-        if "tenant_id" not in row_data and user_id is not None:
-            row_data["tenant_id"] = user_id
+        # tenant_id and user_id are optional - NULL means public/shared data
+        # Data files can explicitly set tenant_id/user_id if needed
 
         # Convert graph_edges to InlineEdge format if present
         if "graph_edges" in row_data:
@@ -644,7 +641,7 @@ async def _diff_async(
 
     if not result.has_changes:
         click.secho("✓ No schema drift detected", fg="green")
-        click.echo(" Database matches
+        click.echo(" Database matches source (tables, functions, triggers, views)")
         if result.filtered_count > 0:
             click.echo()
             click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
@@ -656,17 +653,34 @@
         if result.filtered_count > 0:
             click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
         click.echo()
-
-
-
-
-
-
-
-
-
-
-
+
+        # Table/column changes (Alembic)
+        if result.summary:
+            click.echo("Table Changes:")
+            for line in result.summary:
+                if line.startswith("+"):
+                    click.secho(f" {line}", fg="green")
+                elif line.startswith("-"):
+                    click.secho(f" {line}", fg="red")
+                elif line.startswith("~"):
+                    click.secho(f" {line}", fg="yellow")
+                else:
+                    click.echo(f" {line}")
+            click.echo()
+
+        # Programmable object changes (functions, triggers, views)
+        if result.programmable_summary:
+            click.echo("Programmable Objects (functions/triggers/views):")
+            for line in result.programmable_summary:
+                if line.startswith("+"):
+                    click.secho(f" {line}", fg="green")
+                elif line.startswith("-"):
+                    click.secho(f" {line}", fg="red")
+                elif line.startswith("~"):
+                    click.secho(f" {line}", fg="yellow")
+                else:
+                    click.echo(f" {line}")
+            click.echo()
 
     # Generate migration if requested
     if generate:
```

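The two colorizing loops added to `_diff_async` are identical apart from their heading and source list. If that duplication grows, they could be folded into one small helper; a hypothetical sketch follows (the helper name and placement are illustrative only, not part of this release).

```python
# Hypothetical helper consolidating the two identical colorizing loops
# from _diff_async; name and placement are illustrative assumptions.
import click

def _echo_diff_lines(heading: str, lines: list[str]) -> None:
    """Print diff summary lines with +/-/~ color-coding."""
    if not lines:
        return
    click.echo(heading)
    for line in lines:
        if line.startswith("+"):
            click.secho(f" {line}", fg="green")
        elif line.startswith("-"):
            click.secho(f" {line}", fg="red")
        elif line.startswith("~"):
            click.secho(f" {line}", fg="yellow")
        else:
            click.echo(f" {line}")
    click.echo()

# Usage inside _diff_async would then reduce to two calls:
# _echo_diff_lines("Table Changes:", result.summary)
# _echo_diff_lines("Programmable Objects (functions/triggers/views):", result.programmable_summary)
```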
rem/cli/commands/process.py
CHANGED
```diff
@@ -11,39 +11,102 @@ from rem.services.content import ContentService
 
 
 @click.command(name="ingest")
-@click.argument("
-@click.option("--
+@click.argument("path", type=click.Path(exists=True))
+@click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
+@click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
+@click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
 @click.option("--category", help="Optional file category")
 @click.option("--tags", help="Optional comma-separated tags")
+@click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
+@click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
 def process_ingest(
-
+    path: str,
+    table: str | None,
+    make_private: bool,
     user_id: str | None,
     category: str | None,
     tags: str | None,
+    pattern: str,
+    dry_run: bool,
 ):
     """
-    Ingest
+    Ingest files into REM (storage + parsing + embedding).
 
-
-
-
-
-
-
+    Supports both single files and directories. For directories, recursively
+    processes files matching the pattern (default: **/*.md).
+
+    **IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
+    shared knowledge bases (ontologies, procedures, reference data). Private
+    user-scoped data is rarely needed and requires explicit --make-private flag.
+
+    Target table is auto-detected for schemas (agent.yaml → schemas table).
+    Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).
 
     Examples:
         rem process ingest sample.pdf
         rem process ingest contract.docx --category legal --tags contract,2023
        rem process ingest agent.yaml # Auto-detects kind=agent, saves to schemas table
+
+        # Directory ingestion into ontologies table (PUBLIC - no user-id needed)
+        rem process ingest ontology/procedures/scid-5/ --table ontologies
+        rem process ingest ontology/ --table ontologies --pattern "**/*.md"
+
+        # Preview what would be ingested
+        rem process ingest ontology/ --table ontologies --dry-run
+
+        # RARE: Private user-scoped data (requires --make-private)
+        rem process ingest private-notes.md --make-private --user-id user-123
     """
     import asyncio
+
+    # Validate: user_id requires --make-private flag
+    if user_id and not make_private:
+        raise click.UsageError(
+            "Setting --user-id requires the --make-private flag.\n\n"
+            "Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
+            "is rarely needed - only use --make-private for truly personal content.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # If --make-private is set, user_id is required
+    if make_private and not user_id:
+        raise click.UsageError(
+            "--make-private requires --user-id to specify which user owns the data.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # Clear user_id if not making private (ensure None for public data)
+    effective_user_id = user_id if make_private else None
+    from pathlib import Path
     from ...services.content import ContentService
 
     async def _ingest():
-        # Initialize ContentService with repositories for proper resource saving
         from rem.services.postgres import get_postgres_service
         from rem.services.postgres.repository import Repository
-        from rem.models.entities import File, Resource
+        from rem.models.entities import File, Resource, Ontology
+
+        input_path = Path(path)
+        tag_list = tags.split(",") if tags else None
+
+        # Collect files to process
+        if input_path.is_dir():
+            files_to_process = list(input_path.glob(pattern))
+            if not files_to_process:
+                logger.error(f"No files matching '{pattern}' found in {input_path}")
+                sys.exit(1)
+            logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
+        else:
+            files_to_process = [input_path]
+
+        # Dry run: just show what would be processed
+        if dry_run:
+            logger.info("DRY RUN - Would ingest:")
+            for f in files_to_process[:20]:
+                entity_key = f.stem # filename without extension
+                logger.info(f" {f} → {table or 'auto-detect'} (key: {entity_key})")
+            if len(files_to_process) > 20:
+                logger.info(f" ... and {len(files_to_process) - 20} more files")
+            return
 
         db = get_postgres_service()
         if not db:
@@ -51,53 +114,118 @@ def process_ingest(
         await db.connect()
 
         try:
-
-
-
-
-
-
-
-
-
-
-                user_id=user_id,
-                category=category,
-                tags=tag_list,
-                is_local_server=True, # CLI is local
-            )
-
-            # Handle schema ingestion (agents/evaluators)
-            if result.get("schema_name"):
-                logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
-                logger.info(f"Version: {result.get('version', '1.0.0')}")
-            # Handle file ingestion
-            elif result.get("processing_status") == "completed":
-                logger.success(f"File ingested: {result['file_name']}")
-                logger.info(f"File ID: {result['file_id']}")
-                logger.info(f"Resources created: {result['resources_created']}")
+            # Direct table ingestion (ontologies, etc.)
+            if table:
+                await _ingest_to_table(
+                    db=db,
+                    files=files_to_process,
+                    table_name=table,
+                    user_id=effective_user_id,
+                    category=category,
+                    tag_list=tag_list,
+                )
             else:
-
-
+                # Standard file ingestion via ContentService
+                file_repo = Repository(File, "files", db=db)
+                resource_repo = Repository(Resource, "resources", db=db)
+                service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
+
+                for file_path in files_to_process:
+                    scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
+                    logger.info(f"Ingesting: {file_path} ({scope_msg})")
+
+                    result = await service.ingest_file(
+                        file_uri=str(file_path),
+                        user_id=effective_user_id,
+                        category=category,
+                        tags=tag_list,
+                        is_local_server=True,
+                    )
+
+                    # Handle schema ingestion (agents/evaluators)
+                    if result.get("schema_name"):
+                        logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
+                    elif result.get("processing_status") == "completed":
+                        logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
+                    else:
+                        logger.error(f"Failed: {result.get('message', 'Unknown error')}")
 
         except Exception as e:
             logger.error(f"Error during ingestion: {e}")
             sys.exit(1)
         finally:
-            # Wait for
+            # Wait for embedding worker to finish
             from rem.services.embeddings.worker import get_global_embedding_worker
             try:
                 worker = get_global_embedding_worker()
                 if worker and worker.running and not worker.task_queue.empty():
-                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks
-                    # Worker.stop() waits for queue to drain (see worker.py line ~148)
+                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
                     await worker.stop()
             except RuntimeError:
-                # Worker doesn't exist yet - no tasks queued
                 pass
 
             await db.disconnect()
 
+    async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
+        """Direct ingestion of files to a specific table (ontologies, etc.)."""
+        from rem.services.postgres.repository import Repository
+        from rem import get_model_registry
+        from rem.utils.model_helpers import get_table_name
+
+        # Get model class for table
+        registry = get_model_registry()
+        registry.register_core_models()
+        model_class = None
+        for model in registry.get_model_classes().values():
+            if get_table_name(model) == table_name:
+                model_class = model
+                break
+
+        if not model_class:
+            logger.error(f"Unknown table: {table_name}")
+            sys.exit(1)
+
+        repo = Repository(model_class, table_name, db=db)
+        processed = 0
+        failed = 0
+
+        for file_path in files:
+            try:
+                # Read file content
+                content = file_path.read_text(encoding="utf-8")
+                entity_key = file_path.stem # filename without extension
+
+                # Build entity based on table
+                entity_data = {
+                    "name": entity_key,
+                    "content": content,
+                    "tags": tag_list or [],
+                }
+
+                # Add optional fields
+                if category:
+                    entity_data["category"] = category
+
+                # Scoping: user_id for private data, None for public/shared
+                # tenant_id=None and user_id=None means PUBLIC data (visible to all)
+                entity_data["tenant_id"] = user_id # None = public/shared
+                entity_data["user_id"] = user_id # None = public/shared
+
+                # For ontologies, add URI
+                if table_name == "ontologies":
+                    entity_data["uri"] = f"file://{file_path.absolute()}"
+
+                entity = model_class(**entity_data)
+                await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
+                processed += 1
+                logger.success(f" ✓ {entity_key}")
+
+            except Exception as e:
+                failed += 1
+                logger.error(f" ✗ {file_path.name}: {e}")
+
+        logger.info(f"Completed: {processed} succeeded, {failed} failed")
+
     asyncio.run(_ingest())
 
 
 def register_commands(group: click.Group):
```

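With the new flag validation, inconsistent scoping options fail fast before any database connection is opened. A hedged example of the resulting CLI behavior; the error text comes from the `UsageError` messages in the diff, though the exact output formatting may differ.

```bash
# --user-id without --make-private is rejected before ingestion starts
rem process ingest notes.md --user-id user-123
# → Error: Setting --user-id requires the --make-private flag.

# --make-private without --user-id is also rejected
rem process ingest notes.md --make-private
# → Error: --make-private requires --user-id to specify which user owns the data.

# Valid private ingestion
rem process ingest notes.md --make-private --user-id user-123
```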
rem/services/content/service.py
CHANGED
```diff
@@ -274,7 +274,7 @@ class ContentService:
     async def ingest_file(
         self,
         file_uri: str,
-        user_id: str,
+        user_id: str | None = None,
         category: str | None = None,
         tags: list[str] | None = None,
         is_local_server: bool = False,
@@ -283,6 +283,10 @@ class ContentService:
         """
         Complete file ingestion pipeline: read → store → parse → chunk → embed.
 
+        **IMPORTANT: Data is PUBLIC by default (user_id=None).**
+        This is correct for shared knowledge bases (ontologies, procedures, reference data).
+        Private user-scoped data is rarely needed - only set user_id for truly personal content.
+
         **CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
         in REM. It handles:
 
@@ -319,7 +323,9 @@ class ContentService:
 
         Args:
             file_uri: Source file location (local path, s3://, or https://)
-            user_id: User identifier for data
+            user_id: User identifier for PRIVATE data only. Default None = PUBLIC/shared.
+                Leave as None for shared knowledge bases, ontologies, reference data.
+                Only set for truly private user-specific content.
             category: Optional category tag (document, code, audio, etc.)
             tags: Optional list of tags
             is_local_server: True if running as local/stdio MCP server
@@ -347,12 +353,19 @@ class ContentService:
 
         Example:
             >>> service = ContentService()
+            >>> # PUBLIC data (default) - visible to all users
            >>> result = await service.ingest_file(
-            ...     file_uri="s3://bucket/
-            ...
-            ...     category="legal"
+            ...     file_uri="s3://bucket/procedure.pdf",
+            ...     category="medical"
            ... )
            >>> print(f"Created {result['resources_created']} searchable chunks")
+            >>>
+            >>> # PRIVATE data (rare) - only for user-specific content
+            >>> result = await service.ingest_file(
+            ...     file_uri="s3://bucket/personal-notes.pdf",
+            ...     user_id="user-123",  # Only this user can access
+            ...     category="personal"
+            ... )
         """
         from pathlib import Path
         from uuid import uuid4
```

rem/services/postgres/__init__.py
CHANGED

```diff
@@ -3,22 +3,47 @@ PostgreSQL service for CloudNativePG database operations.
 """
 
 from .diff_service import DiffService, SchemaDiff
+from .programmable_diff_service import (
+    DiffResult,
+    ObjectDiff,
+    ObjectType,
+    ProgrammableDiffService,
+)
 from .repository import Repository
 from .service import PostgresService
 
 
+_postgres_instance: PostgresService | None = None
+
+
 def get_postgres_service() -> PostgresService | None:
     """
-    Get PostgresService instance.
+    Get PostgresService singleton instance.
 
     Returns None if Postgres is disabled.
+    Uses singleton pattern to prevent connection pool exhaustion.
     """
+    global _postgres_instance
+
     from ...settings import settings
 
     if not settings.postgres.enabled:
         return None
 
-
+    if _postgres_instance is None:
+        _postgres_instance = PostgresService()
+
+    return _postgres_instance
 
 
-__all__ = [
+__all__ = [
+    "DiffResult",
+    "DiffService",
+    "ObjectDiff",
+    "ObjectType",
+    "PostgresService",
+    "ProgrammableDiffService",
+    "Repository",
+    "SchemaDiff",
+    "get_postgres_service",
+]
```
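Because `get_postgres_service()` is now a module-level singleton accessor, every caller shares one `PostgresService` and therefore one connection pool. A minimal usage sketch, assuming only the behavior shown in the diff (the accessor returns None when Postgres is disabled):

```python
# Minimal sketch: repeated calls return the same PostgresService instance,
# so callers share one connection pool instead of creating new ones.
from rem.services.postgres import get_postgres_service

svc_a = get_postgres_service()
svc_b = get_postgres_service()

if svc_a is None:
    print("Postgres is disabled in settings")  # documented behavior
else:
    assert svc_a is svc_b  # singleton: same instance on every call
```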