remdb 0.3.181__py3-none-any.whl → 0.3.223__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (48) hide show
  1. rem/agentic/README.md +262 -2
  2. rem/agentic/context.py +173 -0
  3. rem/agentic/context_builder.py +12 -2
  4. rem/agentic/mcp/tool_wrapper.py +2 -2
  5. rem/agentic/providers/pydantic_ai.py +1 -1
  6. rem/agentic/schema.py +2 -2
  7. rem/api/main.py +1 -1
  8. rem/api/mcp_router/server.py +4 -0
  9. rem/api/mcp_router/tools.py +542 -170
  10. rem/api/routers/admin.py +30 -4
  11. rem/api/routers/auth.py +106 -10
  12. rem/api/routers/chat/completions.py +66 -18
  13. rem/api/routers/chat/sse_events.py +7 -3
  14. rem/api/routers/chat/streaming.py +254 -22
  15. rem/api/routers/common.py +18 -0
  16. rem/api/routers/dev.py +7 -1
  17. rem/api/routers/feedback.py +9 -1
  18. rem/api/routers/messages.py +176 -38
  19. rem/api/routers/models.py +9 -1
  20. rem/api/routers/query.py +12 -1
  21. rem/api/routers/shared_sessions.py +16 -0
  22. rem/auth/jwt.py +19 -4
  23. rem/auth/middleware.py +42 -28
  24. rem/cli/README.md +62 -0
  25. rem/cli/commands/db.py +33 -19
  26. rem/cli/commands/process.py +171 -43
  27. rem/models/entities/ontology.py +18 -20
  28. rem/schemas/agents/rem.yaml +1 -1
  29. rem/services/content/service.py +18 -5
  30. rem/services/postgres/__init__.py +28 -3
  31. rem/services/postgres/diff_service.py +57 -5
  32. rem/services/postgres/programmable_diff_service.py +635 -0
  33. rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
  34. rem/services/postgres/register_type.py +11 -10
  35. rem/services/postgres/repository.py +14 -4
  36. rem/services/session/__init__.py +8 -1
  37. rem/services/session/compression.py +40 -2
  38. rem/services/session/pydantic_messages.py +276 -0
  39. rem/settings.py +28 -0
  40. rem/sql/migrations/001_install.sql +125 -7
  41. rem/sql/migrations/002_install_models.sql +136 -126
  42. rem/sql/migrations/004_cache_system.sql +7 -275
  43. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  44. rem/utils/schema_loader.py +6 -6
  45. {remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/METADATA +1 -1
  46. {remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/RECORD +48 -44
  47. {remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/WHEEL +0 -0
  48. {remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/entry_points.txt +0 -0
rem/cli/README.md CHANGED
@@ -434,6 +434,68 @@ Ensure you're using the correct model format:
434
434
  - OpenAI: `openai:gpt-4o-mini`, `openai:gpt-4o`
435
435
  - Anthropic: `anthropic:claude-sonnet-4-5-20250929`
436
436
 
437
+ ## Data Visibility: PUBLIC vs PRIVATE
438
+
439
+ **IMPORTANT: All ingested data is PUBLIC by default.** This is the correct behavior
440
+ for shared knowledge bases (ontologies, procedures, reference data).
441
+
442
+ ### Why PUBLIC by Default?
443
+
444
+ Most data in REM should be searchable by all users:
445
+ - Clinical ontologies (disorders, symptoms, drugs)
446
+ - Procedures and protocols (SCID-5, PHQ-9, etc.)
447
+ - Reference documentation
448
+ - Shared domain knowledge
449
+
450
+ The `rem_lookup()` function searches for data where `user_id IS NULL`, which means
451
+ public data. If you set `user_id` on data, it becomes invisible to other users.
452
+
453
+ ### Ingesting Public Data (Default)
454
+
455
+ ```bash
456
+ # Standard ingestion - data is PUBLIC
457
+ rem process ingest ontology/procedures/ --table ontologies
458
+
459
+ # From S3 - also PUBLIC
460
+ rem process ingest s3://bucket/docs/reference.pdf
461
+ ```
462
+
463
+ ### Ingesting Private Data (Rare)
464
+
465
+ Private data requires explicit `--make-private` flag:
466
+
467
+ ```bash
468
+ # Private user data - requires --make-private and --user-id
469
+ rem process ingest personal-notes.md --make-private --user-id user-123
470
+ ```
471
+
472
+ **When to use private data:**
473
+ - User-uploaded personal documents
474
+ - Session-specific content
475
+ - User notes and annotations
476
+
477
+ **NEVER use private data for:**
478
+ - Ontologies and reference material
479
+ - Clinical procedures and protocols
480
+ - Shared knowledge bases
481
+ - Anything that should be searchable by agents
482
+
483
+ ### Common Mistake
484
+
485
+ If agents can't find data via `search_rem`, the most common cause is that the data
486
+ was ingested with a `user_id` set. Check with:
487
+
488
+ ```sql
489
+ SELECT name, user_id FROM ontologies WHERE name = 'phq-9-procedure';
490
+ -- user_id should be NULL for public data
491
+ ```
492
+
493
+ Fix by setting user_id to NULL:
494
+ ```sql
495
+ UPDATE ontologies SET user_id = NULL WHERE user_id IS NOT NULL;
496
+ UPDATE kv_store SET user_id = NULL WHERE entity_type = 'ontologies' AND user_id IS NOT NULL;
497
+ ```
498
+
437
499
  ## Next Steps
438
500
 
439
501
  1. **Implement Schema Registry**
rem/cli/commands/db.py CHANGED
@@ -469,8 +469,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
469
469
  # Handle direct insert tables (non-CoreModel)
470
470
  if table_name in DIRECT_INSERT_TABLES:
471
471
  for row_data in rows:
472
- if "tenant_id" not in row_data:
473
- row_data["tenant_id"] = "default"
472
+ # tenant_id is optional - NULL means public/shared
474
473
 
475
474
  if table_name == "shared_sessions":
476
475
  await pg.fetch(
@@ -481,7 +480,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
481
480
  row_data["session_id"],
482
481
  row_data["owner_user_id"],
483
482
  row_data["shared_with_user_id"],
484
- row_data["tenant_id"],
483
+ row_data.get("tenant_id"), # Optional - NULL means public
485
484
  )
486
485
  total_loaded += 1
487
486
  logger.success(f"Loaded shared_session: {row_data['owner_user_id']} -> {row_data['shared_with_user_id']}")
@@ -494,10 +493,8 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
494
493
  model_class = MODEL_MAP[table_name]
495
494
 
496
495
  for row_idx, row_data in enumerate(rows):
497
- # user_id stays NULL for public data (accessible by any user)
498
- # Only set tenant_id for scoping - the --user-id flag controls tenant scope
499
- if "tenant_id" not in row_data and user_id is not None:
500
- row_data["tenant_id"] = user_id
496
+ # tenant_id and user_id are optional - NULL means public/shared data
497
+ # Data files can explicitly set tenant_id/user_id if needed
501
498
 
502
499
  # Convert graph_edges to InlineEdge format if present
503
500
  if "graph_edges" in row_data:
@@ -644,7 +641,7 @@ async def _diff_async(
644
641
 
645
642
  if not result.has_changes:
646
643
  click.secho("✓ No schema drift detected", fg="green")
647
- click.echo(" Database matches Pydantic models")
644
+ click.echo(" Database matches source (tables, functions, triggers, views)")
648
645
  if result.filtered_count > 0:
649
646
  click.echo()
650
647
  click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
@@ -656,17 +653,34 @@ async def _diff_async(
656
653
  if result.filtered_count > 0:
657
654
  click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
658
655
  click.echo()
659
- click.echo("Changes:")
660
- for line in result.summary:
661
- if line.startswith("+"):
662
- click.secho(f" {line}", fg="green")
663
- elif line.startswith("-"):
664
- click.secho(f" {line}", fg="red")
665
- elif line.startswith("~"):
666
- click.secho(f" {line}", fg="yellow")
667
- else:
668
- click.echo(f" {line}")
669
- click.echo()
656
+
657
+ # Table/column changes (Alembic)
658
+ if result.summary:
659
+ click.echo("Table Changes:")
660
+ for line in result.summary:
661
+ if line.startswith("+"):
662
+ click.secho(f" {line}", fg="green")
663
+ elif line.startswith("-"):
664
+ click.secho(f" {line}", fg="red")
665
+ elif line.startswith("~"):
666
+ click.secho(f" {line}", fg="yellow")
667
+ else:
668
+ click.echo(f" {line}")
669
+ click.echo()
670
+
671
+ # Programmable object changes (functions, triggers, views)
672
+ if result.programmable_summary:
673
+ click.echo("Programmable Objects (functions/triggers/views):")
674
+ for line in result.programmable_summary:
675
+ if line.startswith("+"):
676
+ click.secho(f" {line}", fg="green")
677
+ elif line.startswith("-"):
678
+ click.secho(f" {line}", fg="red")
679
+ elif line.startswith("~"):
680
+ click.secho(f" {line}", fg="yellow")
681
+ else:
682
+ click.echo(f" {line}")
683
+ click.echo()
670
684
 
671
685
  # Generate migration if requested
672
686
  if generate:
@@ -11,39 +11,102 @@ from rem.services.content import ContentService
11
11
 
12
12
 
13
13
  @click.command(name="ingest")
14
- @click.argument("file_path", type=click.Path(exists=True))
15
- @click.option("--user-id", default=None, help="User ID to scope file privately (default: public/shared)")
14
+ @click.argument("path", type=click.Path(exists=True))
15
+ @click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
16
+ @click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
17
+ @click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
16
18
  @click.option("--category", help="Optional file category")
17
19
  @click.option("--tags", help="Optional comma-separated tags")
20
+ @click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
21
+ @click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
18
22
  def process_ingest(
19
- file_path: str,
23
+ path: str,
24
+ table: str | None,
25
+ make_private: bool,
20
26
  user_id: str | None,
21
27
  category: str | None,
22
28
  tags: str | None,
29
+ pattern: str,
30
+ dry_run: bool,
23
31
  ):
24
32
  """
25
- Ingest a file into REM (storage + parsing + embedding).
33
+ Ingest files into REM (storage + parsing + embedding).
26
34
 
27
- This command performs the full ingestion pipeline:
28
- 1. Reads the file from the local path.
29
- 2. Stores it in the configured storage (local/S3).
30
- 3. Parses the content.
31
- 4. Chunks and embeds the content into Resources.
32
- 5. Creates a File entity record.
35
+ Supports both single files and directories. For directories, recursively
36
+ processes files matching the pattern (default: **/*.md).
37
+
38
+ **IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
39
+ shared knowledge bases (ontologies, procedures, reference data). Private
40
+ user-scoped data is rarely needed and requires explicit --make-private flag.
41
+
42
+ Target table is auto-detected for schemas (agent.yaml → schemas table).
43
+ Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).
33
44
 
34
45
  Examples:
35
46
  rem process ingest sample.pdf
36
47
  rem process ingest contract.docx --category legal --tags contract,2023
37
48
  rem process ingest agent.yaml # Auto-detects kind=agent, saves to schemas table
49
+
50
+ # Directory ingestion into ontologies table (PUBLIC - no user-id needed)
51
+ rem process ingest ontology/procedures/scid-5/ --table ontologies
52
+ rem process ingest ontology/ --table ontologies --pattern "**/*.md"
53
+
54
+ # Preview what would be ingested
55
+ rem process ingest ontology/ --table ontologies --dry-run
56
+
57
+ # RARE: Private user-scoped data (requires --make-private)
58
+ rem process ingest private-notes.md --make-private --user-id user-123
38
59
  """
39
60
  import asyncio
61
+
62
+ # Validate: user_id requires --make-private flag
63
+ if user_id and not make_private:
64
+ raise click.UsageError(
65
+ "Setting --user-id requires the --make-private flag.\n\n"
66
+ "Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
67
+ "is rarely needed - only use --make-private for truly personal content.\n\n"
68
+ "Example: rem process ingest file.md --make-private --user-id user-123"
69
+ )
70
+
71
+ # If --make-private is set, user_id is required
72
+ if make_private and not user_id:
73
+ raise click.UsageError(
74
+ "--make-private requires --user-id to specify which user owns the data.\n\n"
75
+ "Example: rem process ingest file.md --make-private --user-id user-123"
76
+ )
77
+
78
+ # Clear user_id if not making private (ensure None for public data)
79
+ effective_user_id = user_id if make_private else None
80
+ from pathlib import Path
40
81
  from ...services.content import ContentService
41
82
 
42
83
  async def _ingest():
43
- # Initialize ContentService with repositories for proper resource saving
44
84
  from rem.services.postgres import get_postgres_service
45
85
  from rem.services.postgres.repository import Repository
46
- from rem.models.entities import File, Resource
86
+ from rem.models.entities import File, Resource, Ontology
87
+
88
+ input_path = Path(path)
89
+ tag_list = tags.split(",") if tags else None
90
+
91
+ # Collect files to process
92
+ if input_path.is_dir():
93
+ files_to_process = list(input_path.glob(pattern))
94
+ if not files_to_process:
95
+ logger.error(f"No files matching '{pattern}' found in {input_path}")
96
+ sys.exit(1)
97
+ logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
98
+ else:
99
+ files_to_process = [input_path]
100
+
101
+ # Dry run: just show what would be processed
102
+ if dry_run:
103
+ logger.info("DRY RUN - Would ingest:")
104
+ for f in files_to_process[:20]:
105
+ entity_key = f.stem # filename without extension
106
+ logger.info(f" {f} → {table or 'auto-detect'} (key: {entity_key})")
107
+ if len(files_to_process) > 20:
108
+ logger.info(f" ... and {len(files_to_process) - 20} more files")
109
+ return
47
110
 
48
111
  db = get_postgres_service()
49
112
  if not db:
@@ -51,53 +114,118 @@ def process_ingest(
51
114
  await db.connect()
52
115
 
53
116
  try:
54
- file_repo = Repository(File, "files", db=db)
55
- resource_repo = Repository(Resource, "resources", db=db)
56
- service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
57
-
58
- tag_list = tags.split(",") if tags else None
59
-
60
- scope_msg = f"user: {user_id}" if user_id else "public"
61
- logger.info(f"Ingesting file: {file_path} ({scope_msg})")
62
- result = await service.ingest_file(
63
- file_uri=file_path,
64
- user_id=user_id,
65
- category=category,
66
- tags=tag_list,
67
- is_local_server=True, # CLI is local
68
- )
69
-
70
- # Handle schema ingestion (agents/evaluators)
71
- if result.get("schema_name"):
72
- logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
73
- logger.info(f"Version: {result.get('version', '1.0.0')}")
74
- # Handle file ingestion
75
- elif result.get("processing_status") == "completed":
76
- logger.success(f"File ingested: {result['file_name']}")
77
- logger.info(f"File ID: {result['file_id']}")
78
- logger.info(f"Resources created: {result['resources_created']}")
117
+ # Direct table ingestion (ontologies, etc.)
118
+ if table:
119
+ await _ingest_to_table(
120
+ db=db,
121
+ files=files_to_process,
122
+ table_name=table,
123
+ user_id=effective_user_id,
124
+ category=category,
125
+ tag_list=tag_list,
126
+ )
79
127
  else:
80
- logger.error(f"Ingestion failed: {result.get('message', 'Unknown error')}")
81
- sys.exit(1)
128
+ # Standard file ingestion via ContentService
129
+ file_repo = Repository(File, "files", db=db)
130
+ resource_repo = Repository(Resource, "resources", db=db)
131
+ service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
132
+
133
+ for file_path in files_to_process:
134
+ scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
135
+ logger.info(f"Ingesting: {file_path} ({scope_msg})")
136
+
137
+ result = await service.ingest_file(
138
+ file_uri=str(file_path),
139
+ user_id=effective_user_id,
140
+ category=category,
141
+ tags=tag_list,
142
+ is_local_server=True,
143
+ )
144
+
145
+ # Handle schema ingestion (agents/evaluators)
146
+ if result.get("schema_name"):
147
+ logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
148
+ elif result.get("processing_status") == "completed":
149
+ logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
150
+ else:
151
+ logger.error(f"Failed: {result.get('message', 'Unknown error')}")
82
152
 
83
153
  except Exception as e:
84
154
  logger.error(f"Error during ingestion: {e}")
85
155
  sys.exit(1)
86
156
  finally:
87
- # Wait for global embedding worker to finish queued tasks
157
+ # Wait for embedding worker to finish
88
158
  from rem.services.embeddings.worker import get_global_embedding_worker
89
159
  try:
90
160
  worker = get_global_embedding_worker()
91
161
  if worker and worker.running and not worker.task_queue.empty():
92
- logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks to complete...")
93
- # Worker.stop() waits for queue to drain (see worker.py line ~148)
162
+ logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
94
163
  await worker.stop()
95
164
  except RuntimeError:
96
- # Worker doesn't exist yet - no tasks queued
97
165
  pass
98
166
 
99
167
  await db.disconnect()
100
168
 
169
+ async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
170
+ """Direct ingestion of files to a specific table (ontologies, etc.)."""
171
+ from rem.services.postgres.repository import Repository
172
+ from rem import get_model_registry
173
+ from rem.utils.model_helpers import get_table_name
174
+
175
+ # Get model class for table
176
+ registry = get_model_registry()
177
+ registry.register_core_models()
178
+ model_class = None
179
+ for model in registry.get_model_classes().values():
180
+ if get_table_name(model) == table_name:
181
+ model_class = model
182
+ break
183
+
184
+ if not model_class:
185
+ logger.error(f"Unknown table: {table_name}")
186
+ sys.exit(1)
187
+
188
+ repo = Repository(model_class, table_name, db=db)
189
+ processed = 0
190
+ failed = 0
191
+
192
+ for file_path in files:
193
+ try:
194
+ # Read file content
195
+ content = file_path.read_text(encoding="utf-8")
196
+ entity_key = file_path.stem # filename without extension
197
+
198
+ # Build entity based on table
199
+ entity_data = {
200
+ "name": entity_key,
201
+ "content": content,
202
+ "tags": tag_list or [],
203
+ }
204
+
205
+ # Add optional fields
206
+ if category:
207
+ entity_data["category"] = category
208
+
209
+ # Scoping: user_id for private data, "public" for shared
210
+ # tenant_id="public" is the default for shared knowledge bases
211
+ entity_data["tenant_id"] = user_id or "public"
212
+ entity_data["user_id"] = user_id # None = public/shared
213
+
214
+ # For ontologies, add URI
215
+ if table_name == "ontologies":
216
+ entity_data["uri"] = f"file://{file_path.absolute()}"
217
+
218
+ entity = model_class(**entity_data)
219
+ await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
220
+ processed += 1
221
+ logger.success(f" ✓ {entity_key}")
222
+
223
+ except Exception as e:
224
+ failed += 1
225
+ logger.error(f" ✗ {file_path.name}: {e}")
226
+
227
+ logger.info(f"Completed: {processed} succeeded, {failed} failed")
228
+
101
229
  asyncio.run(_ingest())
102
230
 
103
231
  def register_commands(group: click.Group):
@@ -103,32 +103,30 @@ class Ontology(CoreModel):
103
103
  tags=["cv", "engineering"]
104
104
  )
105
105
 
106
- # Direct-loaded: Medical knowledge base from git
107
- disorder_ontology = Ontology(
108
- name="panic-disorder",
109
- uri="git://bwolfson-siggie/Siggy-MVP/ontology/disorders/anxiety/panic-disorder.md",
110
- content="# Panic Disorder\\n\\nPanic disorder is characterized by...",
106
+ # Direct-loaded: Knowledge base from git
107
+ api_docs = Ontology(
108
+ name="rest-api-guide",
109
+ uri="git://example-org/docs/api/rest-api-guide.md",
110
+ content="# REST API Guide\\n\\nThis guide covers RESTful API design...",
111
111
  extracted_data={
112
- "type": "disorder",
113
- "category": "anxiety",
114
- "icd10": "F41.0",
115
- "dsm5_criteria": ["A", "B", "C", "D"],
112
+ "type": "documentation",
113
+ "category": "api",
114
+ "version": "2.0",
116
115
  },
117
- tags=["disorder", "anxiety", "dsm5"]
116
+ tags=["api", "rest", "documentation"]
118
117
  )
119
118
 
120
- # Direct-loaded: Clinical procedure from git
121
- scid_node = Ontology(
122
- name="scid-5-f1",
123
- uri="git://bwolfson-siggie/Siggy-MVP/ontology/procedures/scid-5/module-f/scid-5-f1.md",
124
- content="# scid-5-f1: Panic Attack Screening\\n\\n...",
119
+ # Direct-loaded: Technical spec from git
120
+ config_spec = Ontology(
121
+ name="config-schema",
122
+ uri="git://example-org/docs/specs/config-schema.md",
123
+ content="# Configuration Schema\\n\\nThis document defines...",
125
124
  extracted_data={
126
- "type": "procedure",
127
- "module": "F",
128
- "section": "Panic Disorder",
129
- "dsm5_criterion": "Panic Attack Specifier",
125
+ "type": "specification",
126
+ "format": "yaml",
127
+ "version": "1.0",
130
128
  },
131
- tags=["scid-5", "procedure", "anxiety"]
129
+ tags=["config", "schema", "specification"]
132
130
  )
133
131
  """
134
132
 
@@ -124,7 +124,7 @@ json_schema_extra:
124
124
 
125
125
  # Explicit resource declarations for reference data
126
126
  resources:
127
- - uri: rem://schemas
127
+ - uri: rem://agents
128
128
  name: Agent Schemas List
129
129
  description: List all available agent schemas in the system
130
130
  - uri: rem://status
@@ -274,7 +274,7 @@ class ContentService:
274
274
  async def ingest_file(
275
275
  self,
276
276
  file_uri: str,
277
- user_id: str,
277
+ user_id: str | None = None,
278
278
  category: str | None = None,
279
279
  tags: list[str] | None = None,
280
280
  is_local_server: bool = False,
@@ -283,6 +283,10 @@ class ContentService:
283
283
  """
284
284
  Complete file ingestion pipeline: read → store → parse → chunk → embed.
285
285
 
286
+ **IMPORTANT: Data is PUBLIC by default (user_id=None).**
287
+ This is correct for shared knowledge bases (ontologies, procedures, reference data).
288
+ Private user-scoped data is rarely needed - only set user_id for truly personal content.
289
+
286
290
  **CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
287
291
  in REM. It handles:
288
292
 
@@ -319,7 +323,9 @@ class ContentService:
319
323
 
320
324
  Args:
321
325
  file_uri: Source file location (local path, s3://, or https://)
322
- user_id: User identifier for data isolation and ownership
326
+ user_id: User identifier for PRIVATE data only. Default None = PUBLIC/shared.
327
+ Leave as None for shared knowledge bases, ontologies, reference data.
328
+ Only set for truly private user-specific content.
323
329
  category: Optional category tag (document, code, audio, etc.)
324
330
  tags: Optional list of tags
325
331
  is_local_server: True if running as local/stdio MCP server
@@ -347,12 +353,19 @@ class ContentService:
347
353
 
348
354
  Example:
349
355
  >>> service = ContentService()
356
+ >>> # PUBLIC data (default) - visible to all users
350
357
  >>> result = await service.ingest_file(
351
- ... file_uri="s3://bucket/contract.pdf",
352
- ... user_id="user-123",
353
- ... category="legal"
358
+ ... file_uri="s3://bucket/procedure.pdf",
359
+ ... category="medical"
354
360
  ... )
355
361
  >>> print(f"Created {result['resources_created']} searchable chunks")
362
+ >>>
363
+ >>> # PRIVATE data (rare) - only for user-specific content
364
+ >>> result = await service.ingest_file(
365
+ ... file_uri="s3://bucket/personal-notes.pdf",
366
+ ... user_id="user-123", # Only this user can access
367
+ ... category="personal"
368
+ ... )
356
369
  """
357
370
  from pathlib import Path
358
371
  from uuid import uuid4
@@ -3,22 +3,47 @@ PostgreSQL service for CloudNativePG database operations.
3
3
  """
4
4
 
5
5
  from .diff_service import DiffService, SchemaDiff
6
+ from .programmable_diff_service import (
7
+ DiffResult,
8
+ ObjectDiff,
9
+ ObjectType,
10
+ ProgrammableDiffService,
11
+ )
6
12
  from .repository import Repository
7
13
  from .service import PostgresService
8
14
 
9
15
 
16
+ _postgres_instance: PostgresService | None = None
17
+
18
+
10
19
  def get_postgres_service() -> PostgresService | None:
11
20
  """
12
- Get PostgresService instance.
21
+ Get PostgresService singleton instance.
13
22
 
14
23
  Returns None if Postgres is disabled.
24
+ Uses singleton pattern to prevent connection pool exhaustion.
15
25
  """
26
+ global _postgres_instance
27
+
16
28
  from ...settings import settings
17
29
 
18
30
  if not settings.postgres.enabled:
19
31
  return None
20
32
 
21
- return PostgresService()
33
+ if _postgres_instance is None:
34
+ _postgres_instance = PostgresService()
35
+
36
+ return _postgres_instance
22
37
 
23
38
 
24
- __all__ = ["PostgresService", "get_postgres_service", "Repository", "DiffService", "SchemaDiff"]
39
+ __all__ = [
40
+ "DiffResult",
41
+ "DiffService",
42
+ "ObjectDiff",
43
+ "ObjectType",
44
+ "PostgresService",
45
+ "ProgrammableDiffService",
46
+ "Repository",
47
+ "SchemaDiff",
48
+ "get_postgres_service",
49
+ ]