remdb 0.3.171__py3-none-any.whl → 0.3.230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. rem/agentic/README.md +36 -2
  2. rem/agentic/context.py +173 -0
  3. rem/agentic/context_builder.py +12 -2
  4. rem/agentic/mcp/tool_wrapper.py +39 -16
  5. rem/agentic/providers/pydantic_ai.py +78 -45
  6. rem/agentic/schema.py +6 -5
  7. rem/agentic/tools/rem_tools.py +11 -0
  8. rem/api/main.py +1 -1
  9. rem/api/mcp_router/resources.py +75 -14
  10. rem/api/mcp_router/server.py +31 -24
  11. rem/api/mcp_router/tools.py +621 -166
  12. rem/api/routers/admin.py +30 -4
  13. rem/api/routers/auth.py +114 -15
  14. rem/api/routers/chat/child_streaming.py +379 -0
  15. rem/api/routers/chat/completions.py +74 -37
  16. rem/api/routers/chat/sse_events.py +7 -3
  17. rem/api/routers/chat/streaming.py +352 -257
  18. rem/api/routers/chat/streaming_utils.py +327 -0
  19. rem/api/routers/common.py +18 -0
  20. rem/api/routers/dev.py +7 -1
  21. rem/api/routers/feedback.py +9 -1
  22. rem/api/routers/messages.py +176 -38
  23. rem/api/routers/models.py +9 -1
  24. rem/api/routers/query.py +12 -1
  25. rem/api/routers/shared_sessions.py +16 -0
  26. rem/auth/jwt.py +19 -4
  27. rem/auth/middleware.py +42 -28
  28. rem/cli/README.md +62 -0
  29. rem/cli/commands/ask.py +61 -81
  30. rem/cli/commands/db.py +148 -70
  31. rem/cli/commands/process.py +171 -43
  32. rem/models/entities/ontology.py +91 -101
  33. rem/schemas/agents/rem.yaml +1 -1
  34. rem/services/content/service.py +18 -5
  35. rem/services/email/service.py +11 -2
  36. rem/services/embeddings/worker.py +26 -12
  37. rem/services/postgres/__init__.py +28 -3
  38. rem/services/postgres/diff_service.py +57 -5
  39. rem/services/postgres/programmable_diff_service.py +635 -0
  40. rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
  41. rem/services/postgres/register_type.py +12 -11
  42. rem/services/postgres/repository.py +39 -29
  43. rem/services/postgres/schema_generator.py +5 -5
  44. rem/services/postgres/sql_builder.py +6 -5
  45. rem/services/session/__init__.py +8 -1
  46. rem/services/session/compression.py +40 -2
  47. rem/services/session/pydantic_messages.py +292 -0
  48. rem/settings.py +34 -0
  49. rem/sql/background_indexes.sql +5 -0
  50. rem/sql/migrations/001_install.sql +157 -10
  51. rem/sql/migrations/002_install_models.sql +160 -132
  52. rem/sql/migrations/004_cache_system.sql +7 -275
  53. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  54. rem/utils/model_helpers.py +101 -0
  55. rem/utils/schema_loader.py +79 -51
  56. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/METADATA +2 -2
  57. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/RECORD +59 -53
  58. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/WHEEL +0 -0
  59. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/entry_points.txt +0 -0
rem/cli/commands/process.py
@@ -11,39 +11,102 @@ from rem.services.content import ContentService
 
 
 @click.command(name="ingest")
-@click.argument("file_path", type=click.Path(exists=True))
-@click.option("--user-id", default=None, help="User ID to scope file privately (default: public/shared)")
+@click.argument("path", type=click.Path(exists=True))
+@click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
+@click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
+@click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
 @click.option("--category", help="Optional file category")
 @click.option("--tags", help="Optional comma-separated tags")
+@click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
+@click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
 def process_ingest(
-    file_path: str,
+    path: str,
+    table: str | None,
+    make_private: bool,
     user_id: str | None,
     category: str | None,
     tags: str | None,
+    pattern: str,
+    dry_run: bool,
 ):
     """
-    Ingest a file into REM (storage + parsing + embedding).
+    Ingest files into REM (storage + parsing + embedding).
 
-    This command performs the full ingestion pipeline:
-    1. Reads the file from the local path.
-    2. Stores it in the configured storage (local/S3).
-    3. Parses the content.
-    4. Chunks and embeds the content into Resources.
-    5. Creates a File entity record.
+    Supports both single files and directories. For directories, recursively
+    processes files matching the pattern (default: **/*.md).
+
+    **IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
+    shared knowledge bases (ontologies, procedures, reference data). Private
+    user-scoped data is rarely needed and requires explicit --make-private flag.
+
+    Target table is auto-detected for schemas (agent.yaml → schemas table).
+    Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).
 
     Examples:
         rem process ingest sample.pdf
        rem process ingest contract.docx --category legal --tags contract,2023
        rem process ingest agent.yaml  # Auto-detects kind=agent, saves to schemas table
+
+        # Directory ingestion into ontologies table (PUBLIC - no user-id needed)
+        rem process ingest ontology/procedures/scid-5/ --table ontologies
+        rem process ingest ontology/ --table ontologies --pattern "**/*.md"
+
+        # Preview what would be ingested
+        rem process ingest ontology/ --table ontologies --dry-run
+
+        # RARE: Private user-scoped data (requires --make-private)
+        rem process ingest private-notes.md --make-private --user-id user-123
     """
     import asyncio
+
+    # Validate: user_id requires --make-private flag
+    if user_id and not make_private:
+        raise click.UsageError(
+            "Setting --user-id requires the --make-private flag.\n\n"
+            "Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
+            "is rarely needed - only use --make-private for truly personal content.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # If --make-private is set, user_id is required
+    if make_private and not user_id:
+        raise click.UsageError(
+            "--make-private requires --user-id to specify which user owns the data.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # Clear user_id if not making private (ensure None for public data)
+    effective_user_id = user_id if make_private else None
+    from pathlib import Path
     from ...services.content import ContentService
 
     async def _ingest():
-        # Initialize ContentService with repositories for proper resource saving
        from rem.services.postgres import get_postgres_service
        from rem.services.postgres.repository import Repository
-        from rem.models.entities import File, Resource
+        from rem.models.entities import File, Resource, Ontology
+
+        input_path = Path(path)
+        tag_list = tags.split(",") if tags else None
+
+        # Collect files to process
+        if input_path.is_dir():
+            files_to_process = list(input_path.glob(pattern))
+            if not files_to_process:
+                logger.error(f"No files matching '{pattern}' found in {input_path}")
+                sys.exit(1)
+            logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
+        else:
+            files_to_process = [input_path]
+
+        # Dry run: just show what would be processed
+        if dry_run:
+            logger.info("DRY RUN - Would ingest:")
+            for f in files_to_process[:20]:
+                entity_key = f.stem  # filename without extension
+                logger.info(f"  {f} → {table or 'auto-detect'} (key: {entity_key})")
+            if len(files_to_process) > 20:
+                logger.info(f"  ... and {len(files_to_process) - 20} more files")
+            return
 
        db = get_postgres_service()
        if not db:
@@ -51,53 +114,118 @@ def process_ingest(
        await db.connect()
 
        try:
-            file_repo = Repository(File, "files", db=db)
-            resource_repo = Repository(Resource, "resources", db=db)
-            service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
-
-            tag_list = tags.split(",") if tags else None
-
-            scope_msg = f"user: {user_id}" if user_id else "public"
-            logger.info(f"Ingesting file: {file_path} ({scope_msg})")
-            result = await service.ingest_file(
-                file_uri=file_path,
-                user_id=user_id,
-                category=category,
-                tags=tag_list,
-                is_local_server=True,  # CLI is local
-            )
-
-            # Handle schema ingestion (agents/evaluators)
-            if result.get("schema_name"):
-                logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
-                logger.info(f"Version: {result.get('version', '1.0.0')}")
-            # Handle file ingestion
-            elif result.get("processing_status") == "completed":
-                logger.success(f"File ingested: {result['file_name']}")
-                logger.info(f"File ID: {result['file_id']}")
-                logger.info(f"Resources created: {result['resources_created']}")
+            # Direct table ingestion (ontologies, etc.)
+            if table:
+                await _ingest_to_table(
+                    db=db,
+                    files=files_to_process,
+                    table_name=table,
+                    user_id=effective_user_id,
+                    category=category,
+                    tag_list=tag_list,
+                )
            else:
-                logger.error(f"Ingestion failed: {result.get('message', 'Unknown error')}")
-                sys.exit(1)
+                # Standard file ingestion via ContentService
+                file_repo = Repository(File, "files", db=db)
+                resource_repo = Repository(Resource, "resources", db=db)
+                service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
+
+                for file_path in files_to_process:
+                    scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
+                    logger.info(f"Ingesting: {file_path} ({scope_msg})")
+
+                    result = await service.ingest_file(
+                        file_uri=str(file_path),
+                        user_id=effective_user_id,
+                        category=category,
+                        tags=tag_list,
+                        is_local_server=True,
+                    )
+
+                    # Handle schema ingestion (agents/evaluators)
+                    if result.get("schema_name"):
+                        logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
+                    elif result.get("processing_status") == "completed":
+                        logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
+                    else:
+                        logger.error(f"Failed: {result.get('message', 'Unknown error')}")
 
        except Exception as e:
            logger.error(f"Error during ingestion: {e}")
            sys.exit(1)
        finally:
-            # Wait for global embedding worker to finish queued tasks
+            # Wait for embedding worker to finish
            from rem.services.embeddings.worker import get_global_embedding_worker
            try:
                worker = get_global_embedding_worker()
                if worker and worker.running and not worker.task_queue.empty():
-                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks to complete...")
-                    # Worker.stop() waits for queue to drain (see worker.py line ~148)
+                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
                    await worker.stop()
            except RuntimeError:
-                # Worker doesn't exist yet - no tasks queued
                pass
 
            await db.disconnect()
 
+    async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
+        """Direct ingestion of files to a specific table (ontologies, etc.)."""
+        from rem.services.postgres.repository import Repository
+        from rem import get_model_registry
+        from rem.utils.model_helpers import get_table_name
+
+        # Get model class for table
+        registry = get_model_registry()
+        registry.register_core_models()
+        model_class = None
+        for model in registry.get_model_classes().values():
+            if get_table_name(model) == table_name:
+                model_class = model
+                break
+
+        if not model_class:
+            logger.error(f"Unknown table: {table_name}")
+            sys.exit(1)
+
+        repo = Repository(model_class, table_name, db=db)
+        processed = 0
+        failed = 0
+
+        for file_path in files:
+            try:
+                # Read file content
+                content = file_path.read_text(encoding="utf-8")
+                entity_key = file_path.stem  # filename without extension
+
+                # Build entity based on table
+                entity_data = {
+                    "name": entity_key,
+                    "content": content,
+                    "tags": tag_list or [],
+                }
+
+                # Add optional fields
+                if category:
+                    entity_data["category"] = category
+
+                # Scoping: user_id for private data, "public" for shared
+                # tenant_id="public" is the default for shared knowledge bases
+                entity_data["tenant_id"] = user_id or "public"
+                entity_data["user_id"] = user_id  # None = public/shared
+
+                # For ontologies, add URI
+                if table_name == "ontologies":
+                    entity_data["uri"] = f"file://{file_path.absolute()}"
+
+                entity = model_class(**entity_data)
+                await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
+                processed += 1
+                logger.success(f"  ✓ {entity_key}")
+
+            except Exception as e:
+                failed += 1
+                logger.error(f"  ✗ {file_path.name}: {e}")
+
+        logger.info(f"Completed: {processed} succeeded, {failed} failed")
+
    asyncio.run(_ingest())
 
 def register_commands(group: click.Group):
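
For readers skimming the diff, the new public/private handling in `rem process ingest` boils down to one rule: --user-id and --make-private must be supplied together, otherwise ingestion stays public. A minimal sketch of that rule outside Click (the helper name resolve_scope is illustrative, not part of the package):

    def resolve_scope(user_id: str | None, make_private: bool) -> str | None:
        """Return the effective user_id for ingestion, or None for public data."""
        if user_id and not make_private:
            raise ValueError("--user-id requires --make-private")
        if make_private and not user_id:
            raise ValueError("--make-private requires --user-id")
        # Both flags present -> private; neither -> public (the default)
        return user_id if make_private else None

    assert resolve_scope(None, False) is None             # public, the default
    assert resolve_scope("user-123", True) == "user-123"  # private, both flags given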
rem/models/entities/ontology.py
@@ -1,63 +1,55 @@
-"""Ontology entity for tenant-specific knowledge extensions.
+"""Ontology entity for domain-specific knowledge.
 
-**What is Ontology Extraction?**
+**What are Ontologies?**
 
-Ontologies are **domain-specific structured knowledge** extracted from files using custom
-agent schemas. They extend REM's normal file processing pipeline with tenant-specific
-parsers that extract structured data the standard chunking pipeline would miss.
+Ontologies are **domain-specific structured knowledge** that can be:
+1. **Extracted** from files using custom agent schemas (agent-extracted)
+2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
 
-**Normal File Processing:**
-File → extract text → chunk → embed → resources (semantic search ready)
+**Use Case 1: Agent-Extracted Ontologies**
 
-**Ontology Processing (Tenant Knowledge Extensions):**
 File → custom agent → structured JSON → ontology (domain knowledge)
 
-**Why Ontologies?**
-- Standard chunking gives you semantic search over raw content
-- Ontologies give you **structured queryable fields** from domain logic
-- Example: A contract PDF becomes both searchable chunks AND a structured record with
-  parties, dates, payment terms, obligations as queryable fields
+Example: A contract PDF becomes a structured record with parties, dates, payment terms.
+
+**Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
+
+External source (git/S3) → load → ontology (reference knowledge)
+
+Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
+files in a git repository. Each markdown file becomes an ontology node with:
+- `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
+- `content`: markdown content for embedding/search
+- `extracted_data`: parsed frontmatter or structure
 
 **Architecture:**
-- Runs as part of dreaming worker (background knowledge extraction)
-- OntologyConfig defines which files trigger which extractors (MIME type, URI pattern, tags)
+- Runs as part of dreaming worker (background knowledge extraction) OR
+- Loaded directly via `rem db load` for external knowledge bases
+- OntologyConfig defines which files trigger which extractors
 - Multiple ontologies per file (apply different domain lenses)
-- Tenant-scoped: Each tenant can define their own extractors
+- Tenant-scoped: Each tenant can define their own extractors and knowledge bases
 
 **Use Cases:**
 
-1. **Recruitment (CV Parsing)**
-   - Standard pipeline: Chunks for "find me candidates with Python experience"
-   - Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])
-
-2. **Legal (Contract Analysis)**
-   - Standard pipeline: Semantic search over contract text
-   - Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])
+1. **Recruitment (CV Parsing)** - Agent-extracted
+   - Ontology: Structured fields for filtering/sorting (years_experience, skills[])
 
-3. **Medical (Health Records)**
-   - Standard pipeline: Find mentions of conditions
-   - Ontology: Structured diagnoses, medications, dosages, treatment plans
+2. **Legal (Contract Analysis)** - Agent-extracted
+   - Ontology: Queryable fields (parties, effective_date, payment_amount)
 
-4. **Finance (Report Analysis)**
-   - Standard pipeline: Search for financial terms
-   - Ontology: Extracted metrics, risk_flags, trends, forecasts
+3. **Medical Knowledge Base** - Direct-loaded
+   - Ontology: Disorders, symptoms, medications from curated markdown files
+   - Enables semantic search over psychiatric/medical domain knowledge
 
-**Example Flow:**
-1. Tenant creates OntologyConfig: "Run cv-parser-v1 on files with mime_type='application/pdf' and tags=['resume']"
-2. File uploaded with tags=["resume"]
-3. Normal processing: File → chunks → resources
-4. Dreaming worker detects matching OntologyConfig
-5. Loads cv-parser-v1 agent schema from database
-6. Runs agent on file content → extracts structured data
-7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
-8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL
+4. **Documentation/Procedures** - Direct-loaded
+   - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
+   - Reference material accessible via RAG
 
 **Design:**
-- Each ontology links to a File via file_id
-- Agent schema tracked via agent_schema_id (human-readable label, not UUID)
-- Structured data in `extracted_data` (arbitrary JSON, schema defined by agent)
-- Embeddings generated for semantic search (configurable fields via agent schema)
-- Multiple ontologies per file using different schemas
+- `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
+- `uri` field for external source references (git://, s3://, https://)
+- Structured data in `extracted_data` (arbitrary JSON)
+- Embeddings generated for semantic search via `content` field
 - Tenant-isolated: OntologyConfigs are tenant-scoped
 """
 
@@ -70,18 +62,19 @@ from ..core.core_model import CoreModel
 
 
 class Ontology(CoreModel):
-    """Domain-specific knowledge extracted from files using custom agents.
+    """Domain-specific knowledge - either agent-extracted or direct-loaded.
 
    Attributes:
        name: Human-readable label for this ontology instance
-        file_id: Foreign key to File entity that was processed
-        agent_schema_id: Foreign key to Schema entity that performed extraction
-        provider_name: LLM provider used for extraction (e.g., "anthropic", "openai")
-        model_name: Specific model used (e.g., "claude-sonnet-4-5")
-        extracted_data: Structured data extracted by agent (arbitrary JSON)
+        uri: External source reference (git://, s3://, https://) for direct-loaded ontologies
+        file_id: Foreign key to File entity (optional - only for agent-extracted)
+        agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
+        provider_name: LLM provider used for extraction (optional)
+        model_name: Specific model used (optional)
+        extracted_data: Structured data - either extracted by agent or parsed from source
        confidence_score: Optional confidence score from extraction (0.0-1.0)
        extraction_timestamp: When extraction was performed
-        embedding_text: Text used for generating embedding (derived from extracted_data)
+        content: Text used for generating embedding
 
    Inherited from CoreModel:
        id: UUID or string identifier
@@ -93,10 +86,9 @@ class Ontology(CoreModel):
        graph_edges: Relationships to other entities
        metadata: Flexible metadata storage
        tags: Classification tags
-        column: Database schema metadata
 
    Example Usage:
-        # CV extraction
+        # Agent-extracted: CV parsing
        cv_ontology = Ontology(
            name="john-doe-cv-2024",
            file_id="file-uuid-123",
@@ -105,73 +97,72 @@ class Ontology(CoreModel):
            model_name="claude-sonnet-4-5-20250929",
            extracted_data={
                "candidate_name": "John Doe",
-                "email": "john@example.com",
                "skills": ["Python", "PostgreSQL", "Kubernetes"],
-                "experience": [
-                    {
-                        "company": "TechCorp",
-                        "role": "Senior Engineer",
-                        "years": 3,
-                        "achievements": ["Led migration to k8s", "Reduced costs 40%"]
-                    }
-                ],
-                "education": [
-                    {"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
-                ]
            },
            confidence_score=0.95,
-            tags=["cv", "engineering", "senior-level"]
+            tags=["cv", "engineering"]
        )
 
-        # Contract extraction
-        contract_ontology = Ontology(
-            name="acme-supplier-agreement-2024",
-            file_id="file-uuid-456",
-            agent_schema_id="contract-parser-v2",
-            provider_name="openai",
-            model_name="gpt-4.1",
+        # Direct-loaded: Knowledge base from git
+        api_docs = Ontology(
+            name="rest-api-guide",
+            uri="git://example-org/docs/api/rest-api-guide.md",
+            content="# REST API Guide\\n\\nThis guide covers RESTful API design...",
            extracted_data={
-                "contract_type": "supplier_agreement",
-                "parties": [
-                    {"name": "ACME Corp", "role": "buyer"},
-                    {"name": "SupplyChain Inc", "role": "supplier"}
-                ],
-                "effective_date": "2024-01-01",
-                "termination_date": "2026-12-31",
-                "payment_terms": {
-                    "amount": 500000,
-                    "currency": "USD",
-                    "frequency": "quarterly"
-                },
-                "key_obligations": [
-                    "Supplier must deliver within 30 days",
-                    "Buyer must pay within 60 days of invoice"
-                ]
+                "type": "documentation",
+                "category": "api",
+                "version": "2.0",
+            },
+            tags=["api", "rest", "documentation"]
+        )
+
+        # Direct-loaded: Technical spec from git
+        config_spec = Ontology(
+            name="config-schema",
+            uri="git://example-org/docs/specs/config-schema.md",
+            content="# Configuration Schema\\n\\nThis document defines...",
+            extracted_data={
+                "type": "specification",
+                "format": "yaml",
+                "version": "1.0",
            },
-            confidence_score=0.92,
-            tags=["contract", "supplier", "procurement"]
+            tags=["config", "schema", "specification"]
        )
    """
 
    # Core fields
    name: str
-    file_id: UUID | str
-    agent_schema_id: str  # Natural language label of Schema entity
+    uri: Optional[str] = None  # External source: git://, s3://, https://
 
-    # Extraction metadata
-    provider_name: str  # LLM provider (anthropic, openai, etc.)
-    model_name: str  # Specific model used
-    extracted_data: dict[str, Any]  # Arbitrary structured data from agent
+    # Agent extraction fields (optional - only for agent-extracted ontologies)
+    file_id: Optional[UUID | str] = None  # FK to File entity
+    agent_schema_id: Optional[str] = None  # Schema that performed extraction
+    provider_name: Optional[str] = None  # LLM provider (anthropic, openai, etc.)
+    model_name: Optional[str] = None  # Specific model used
+
+    # Data fields
+    extracted_data: Optional[dict[str, Any]] = None  # Structured data
    confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
    extraction_timestamp: Optional[str] = None  # ISO8601 timestamp
 
-    # Semantic search support
-    embedding_text: Optional[str] = None  # Text for embedding generation
+    # Semantic search support - 'content' is a default embeddable field name
+    content: Optional[str] = None  # Text for embedding generation
 
    model_config = ConfigDict(
        json_schema_extra={
-            "description": "Domain-specific knowledge extracted from files using custom agents",
+            "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
            "examples": [
+                {
+                    "name": "panic-disorder",
+                    "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
+                    "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
+                    "extracted_data": {
+                        "type": "disorder",
+                        "category": "anxiety",
+                        "icd10": "F41.0"
+                    },
+                    "tags": ["disorder", "anxiety"]
+                },
                {
                    "name": "john-doe-cv-2024",
                    "file_id": "550e8400-e29b-41d4-a716-446655440000",
@@ -180,8 +171,7 @@ class Ontology(CoreModel):
                    "model_name": "claude-sonnet-4-5-20250929",
                    "extracted_data": {
                        "candidate_name": "John Doe",
-                        "skills": ["Python", "PostgreSQL"],
-                        "experience": []
+                        "skills": ["Python", "PostgreSQL"]
                    },
                    "confidence_score": 0.95,
                    "tags": ["cv", "engineering"]
rem/schemas/agents/rem.yaml
@@ -124,7 +124,7 @@ json_schema_extra:
 
 # Explicit resource declarations for reference data
 resources:
-  - uri: rem://schemas
+  - uri: rem://agents
    name: Agent Schemas List
    description: List all available agent schemas in the system
  - uri: rem://status
rem/services/content/service.py
@@ -274,7 +274,7 @@ class ContentService:
    async def ingest_file(
        self,
        file_uri: str,
-        user_id: str,
+        user_id: str | None = None,
        category: str | None = None,
        tags: list[str] | None = None,
        is_local_server: bool = False,
@@ -283,6 +283,10 @@ class ContentService:
        """
        Complete file ingestion pipeline: read → store → parse → chunk → embed.
 
+        **IMPORTANT: Data is PUBLIC by default (user_id=None).**
+        This is correct for shared knowledge bases (ontologies, procedures, reference data).
+        Private user-scoped data is rarely needed - only set user_id for truly personal content.
+
        **CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
        in REM. It handles:
 
@@ -319,7 +323,9 @@ class ContentService:
 
        Args:
            file_uri: Source file location (local path, s3://, or https://)
-            user_id: User identifier for data isolation and ownership
+            user_id: User identifier for PRIVATE data only. Default None = PUBLIC/shared.
+                Leave as None for shared knowledge bases, ontologies, reference data.
+                Only set for truly private user-specific content.
            category: Optional category tag (document, code, audio, etc.)
            tags: Optional list of tags
            is_local_server: True if running as local/stdio MCP server
@@ -347,12 +353,19 @@ class ContentService:
 
        Example:
            >>> service = ContentService()
+            >>> # PUBLIC data (default) - visible to all users
            >>> result = await service.ingest_file(
-            ...     file_uri="s3://bucket/contract.pdf",
-            ...     user_id="user-123",
-            ...     category="legal"
+            ...     file_uri="s3://bucket/procedure.pdf",
+            ...     category="medical"
            ... )
            >>> print(f"Created {result['resources_created']} searchable chunks")
+            >>>
+            >>> # PRIVATE data (rare) - only for user-specific content
+            >>> result = await service.ingest_file(
+            ...     file_uri="s3://bucket/personal-notes.pdf",
+            ...     user_id="user-123",  # Only this user can access
+            ...     category="personal"
+            ... )
        """
        from pathlib import Path
        from uuid import uuid4
rem/services/email/service.py
@@ -376,8 +376,17 @@ class EmailService:
            await user_repo.upsert(existing_user)
            return {"allowed": True, "error": None}
        else:
-            # New user - check if domain is trusted
-            if settings and hasattr(settings, 'email') and settings.email.trusted_domain_list:
+            # New user - first check if they're a subscriber (by email lookup)
+            from ...models.entities import Subscriber
+            subscriber_repo = Repository(Subscriber, db=db)
+            existing_subscriber = await subscriber_repo.find_one({"email": email})
+
+            if existing_subscriber:
+                # Subscriber exists - allow them to create account
+                # (approved field may not exist in older schemas, so just check existence)
+                logger.info(f"Subscriber {email} creating user account")
+            elif settings and hasattr(settings, 'email') and settings.email.trusted_domain_list:
+                # Not an approved subscriber - check if domain is trusted
                if not settings.email.is_domain_trusted(email):
                    email_domain = email.split("@")[-1]
                    logger.warning(f"Untrusted domain attempted signup: {email_domain}")