remdb 0.3.171__py3-none-any.whl → 0.3.180__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rem/cli/commands/db.py CHANGED
@@ -333,29 +333,46 @@ def rebuild_cache(connection: str | None):
 
  @click.command()
  @click.argument("file_path", type=click.Path(exists=True, path_type=Path))
+ @click.option("--table", "-t", default=None, help="Target table name (required for non-YAML formats)")
  @click.option("--user-id", default=None, help="User ID to scope data privately (default: public/shared)")
  @click.option("--dry-run", is_flag=True, help="Show what would be loaded without loading")
- def load(file_path: Path, user_id: str | None, dry_run: bool):
+ def load(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
      """
-     Load data from YAML file into database.
+     Load data from file into database.
 
-     File format:
-     - table: resources
-       key_field: name
-       rows:
-         - name: Example
-           content: Test data...
+     Supports YAML with embedded metadata, or any tabular format via Polars
+     (jsonl, parquet, csv, json, arrow, etc.). For non-YAML formats, use --table.
 
      Examples:
-         rem db load rem/tests/data/graph_seed.yaml
-         rem db load data.yaml --user-id my-user  # Private to user
-         rem db load data.yaml --dry-run
+         rem db load data.yaml                # YAML with metadata
+         rem db load data.jsonl -t resources  # Any Polars-supported format
      """
-     asyncio.run(_load_async(file_path, user_id, dry_run))
+     asyncio.run(_load_async(file_path, table, user_id, dry_run))
 
 
- async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
+ def _load_dataframe_from_file(file_path: Path) -> "pl.DataFrame":
+     """Load any Polars-supported file format into a DataFrame."""
+     import polars as pl
+ 
+     suffix = file_path.suffix.lower()
+ 
+     if suffix in {".jsonl", ".ndjson"}:
+         return pl.read_ndjson(file_path)
+     elif suffix in {".parquet", ".pq"}:
+         return pl.read_parquet(file_path)
+     elif suffix == ".csv":
+         return pl.read_csv(file_path)
+     elif suffix == ".json":
+         return pl.read_json(file_path)
+     elif suffix in {".ipc", ".arrow"}:
+         return pl.read_ipc(file_path)
+     else:
+         raise ValueError(f"Unsupported file format: {suffix}. Use any Polars-supported format.")
+ 
+ 
+ async def _load_async(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
      """Async implementation of load command."""
+     import polars as pl
      import yaml
      from ...models.core.inline_edge import InlineEdge
      from ...models.entities import Resource, Moment, User, Message, SharedSession, Schema
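
For orientation, here is a minimal sketch of the new Polars path exercised outside the CLI. The resource rows and file name are made up for illustration, and the suffix dispatch mirrors _load_dataframe_from_file above rather than importing it from the package:

    # Write a small JSONL file, then read it back the way the loader would.
    from pathlib import Path
    import polars as pl

    rows = [
        {"name": "example-resource", "content": "Test data..."},   # hypothetical rows
        {"name": "another-resource", "content": "More test data"},
    ]
    path = Path("resources.jsonl")
    pl.DataFrame(rows).write_ndjson(path)

    # Same dispatch idea as _load_dataframe_from_file: pick a reader by suffix.
    readers = {".jsonl": pl.read_ndjson, ".csv": pl.read_csv, ".parquet": pl.read_parquet}
    df = readers[path.suffix.lower()](path)
    print(df.to_dicts())  # [{'name': 'example-resource', ...}, ...]

The equivalent CLI invocation would then be `rem db load resources.jsonl -t resources`.
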
@@ -365,21 +382,10 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
      scope_msg = f"user: {user_id}" if user_id else "public"
      logger.info(f"Scope: {scope_msg}")
 
-     # Load YAML file
-     with open(file_path) as f:
-         data = yaml.safe_load(f)
- 
-     if not isinstance(data, list):
-         logger.error("YAML must be a list of table definitions")
-         raise click.Abort()
- 
-     if dry_run:
-         logger.info("DRY RUN - Would load:")
-         logger.info(yaml.dump(data, default_flow_style=False))
-         return
+     suffix = file_path.suffix.lower()
+     is_yaml = suffix in {".yaml", ".yml"}
 
      # Map table names to model classes
-     # CoreModel subclasses use Repository.upsert()
      MODEL_MAP = {
          "users": User,
          "moments": Moment,
@@ -391,6 +397,58 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
      # Non-CoreModel tables that need direct SQL insertion
      DIRECT_INSERT_TABLES = {"shared_sessions"}
 
+     # Parse file based on format
+     if is_yaml:
+         # YAML with embedded metadata
+         with open(file_path) as f:
+             data = yaml.safe_load(f)
+ 
+         if not isinstance(data, list):
+             logger.error("YAML must be a list of table definitions")
+             raise click.Abort()
+ 
+         if dry_run:
+             logger.info("DRY RUN - Would load:")
+             logger.info(yaml.dump(data, default_flow_style=False))
+             return
+ 
+         table_defs = data
+     else:
+         # Polars-supported format - require --table
+         if not table:
+             logger.error(f"For {suffix} files, --table is required. Example: rem db load {file_path.name} -t resources")
+             raise click.Abort()
+ 
+         try:
+             df = _load_dataframe_from_file(file_path)
+         except Exception as e:
+             logger.error(f"Failed to load file: {e}")
+             raise click.Abort()
+ 
+         rows = df.to_dicts()
+ 
+         if dry_run:
+             logger.info(f"DRY RUN - Would load {len(rows)} rows to table '{table}':")
+             logger.info(f"Columns: {list(df.columns)}")
+ 
+             # Validate first row against model if table is known
+             if table in {"users", "moments", "resources", "messages", "schemas"} and rows:
+                 from ...models.entities import Resource, Moment, User, Message, Schema
+                 from ...utils.model_helpers import validate_data_for_model
+                 model_map = {"users": User, "moments": Moment, "resources": Resource,
+                              "messages": Message, "schemas": Schema}
+                 result = validate_data_for_model(model_map[table], rows[0])
+                 if result.extra_fields:
+                     logger.warning(f"Unknown fields (ignored): {result.extra_fields}")
+                 if result.valid:
+                     logger.success(f"Sample row validates OK. Required: {result.required_fields or '(none)'}")
+                 else:
+                     result.log_errors("Sample row")
+             return
+ 
+         # Wrap as single table definition
+         table_defs = [{"table": table, "rows": rows}]
+ 
      # Connect to database
      pg = get_postgres_service()
      if not pg:
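
Both parse branches converge on the same in-memory shape before anything touches the database. A minimal sketch of that shape, with illustrative values:

    # What the YAML branch parses, and what the Polars branch builds from --table.
    table_defs = [
        {
            "table": "resources",  # target table (from the YAML file or --table)
            "rows": [
                {"name": "example", "content": "Test data..."},  # illustrative row
            ],
        },
    ]
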
@@ -402,20 +460,17 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
      try:
          total_loaded = 0
 
-         for table_def in data:
+         for table_def in table_defs:
              table_name = table_def["table"]
-             key_field = table_def.get("key_field", "id")
              rows = table_def.get("rows", [])
 
              # Handle direct insert tables (non-CoreModel)
              if table_name in DIRECT_INSERT_TABLES:
                  for row_data in rows:
-                     # Add tenant_id if not present
                      if "tenant_id" not in row_data:
                          row_data["tenant_id"] = "default"
 
                      if table_name == "shared_sessions":
-                         # Insert shared_session directly
                          await pg.fetch(
                              """INSERT INTO shared_sessions
                              (session_id, owner_user_id, shared_with_user_id, tenant_id)
@@ -434,12 +489,9 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                  logger.warning(f"Unknown table: {table_name}, skipping")
                  continue
 
-             model_class = MODEL_MAP[table_name]  # Type is inferred from MODEL_MAP
+             model_class = MODEL_MAP[table_name]
 
-             for row_data in rows:
-                 # Add user_id and tenant_id only if explicitly provided
-                 # Default is public (None) - data is shared/visible to all
-                 # Pass --user-id to scope data privately to a specific user
+             for row_idx, row_data in enumerate(rows):
                  if "user_id" not in row_data and user_id is not None:
                      row_data["user_id"] = user_id
                  if "tenant_id" not in row_data and user_id is not None:
@@ -452,26 +504,28 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                          for edge in row_data["graph_edges"]
                      ]
 
-                 # Convert any ISO timestamp strings with Z suffix to naive datetime
-                 # This handles fields like starts_timestamp, ends_timestamp, etc.
+                 # Convert ISO timestamp strings
                  from ...utils.date_utils import parse_iso
                  for key, value in list(row_data.items()):
                      if isinstance(value, str) and (key.endswith("_timestamp") or key.endswith("_at")):
                          try:
                              row_data[key] = parse_iso(value)
                          except (ValueError, TypeError):
-                             pass  # Not a valid datetime string, leave as-is
+                             pass
 
-                 # Create model instance and upsert via repository
                  from ...services.postgres.repository import Repository
+                 from ...utils.model_helpers import validate_data_for_model
+ 
+                 result = validate_data_for_model(model_class, row_data)
+                 if not result.valid:
+                     result.log_errors(f"Row {row_idx + 1} ({table_name})")
+                     raise click.Abort()
 
-                 instance = model_class(**row_data)
-                 repo = Repository(model_class, table_name, pg)  # Type inferred from MODEL_MAP
-                 await repo.upsert(instance)  # type: ignore[arg-type]
+                 repo = Repository(model_class, table_name, pg)
+                 await repo.upsert(result.instance)  # type: ignore[arg-type]
                  total_loaded += 1
 
-                 # Log based on model type
-                 name = getattr(instance, 'name', getattr(instance, 'id', '?'))
+                 name = getattr(result.instance, 'name', getattr(result.instance, 'id', '?'))
                  logger.success(f"Loaded {table_name[:-1]}: {name}")
 
          logger.success(f"Data loaded successfully! Total rows: {total_loaded}")
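
The loop above now fails fast on the first row that does not validate against its model. As a rough stand-in for the idea behind validate_data_for_model (the model and rows below are simplified placeholders, not the package's helpers), plain Pydantic validation captures the same behaviour:

    from pydantic import BaseModel, ValidationError

    class Resource(BaseModel):  # simplified stand-in for the package's Resource model
        name: str
        content: str | None = None

    rows = [{"name": "Example", "content": "Test data..."}, {"content": "missing name"}]

    for idx, row in enumerate(rows):
        try:
            instance = Resource(**row)  # roughly the check the helper performs per row
        except ValidationError as exc:
            print(f"Row {idx + 1} (resources): {exc.error_count()} validation error(s)")
            break  # the CLI aborts on the first invalid row
        print(f"Loaded resource: {instance.name}")
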
@@ -1,63 +1,55 @@
- """Ontology entity for tenant-specific knowledge extensions.
+ """Ontology entity for domain-specific knowledge.
 
- **What is Ontology Extraction?**
+ **What are Ontologies?**
 
- Ontologies are **domain-specific structured knowledge** extracted from files using custom
- agent schemas. They extend REM's normal file processing pipeline with tenant-specific
- parsers that extract structured data the standard chunking pipeline would miss.
+ Ontologies are **domain-specific structured knowledge** that can be:
+ 1. **Extracted** from files using custom agent schemas (agent-extracted)
+ 2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
 
- **Normal File Processing:**
- File → extract text → chunk → embed → resources (semantic search ready)
+ **Use Case 1: Agent-Extracted Ontologies**
 
- **Ontology Processing (Tenant Knowledge Extensions):**
  File → custom agent → structured JSON → ontology (domain knowledge)
 
- **Why Ontologies?**
- - Standard chunking gives you semantic search over raw content
- - Ontologies give you **structured queryable fields** from domain logic
- - Example: A contract PDF becomes both searchable chunks AND a structured record with
-   parties, dates, payment terms, obligations as queryable fields
+ Example: A contract PDF becomes a structured record with parties, dates, payment terms.
+ 
+ **Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
+ 
+ External source (git/S3) → load → ontology (reference knowledge)
+ 
+ Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
+ files in a git repository. Each markdown file becomes an ontology node with:
+ - `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
+ - `content`: markdown content for embedding/search
+ - `extracted_data`: parsed frontmatter or structure
 
  **Architecture:**
- - Runs as part of dreaming worker (background knowledge extraction)
- - OntologyConfig defines which files trigger which extractors (MIME type, URI pattern, tags)
+ - Runs as part of dreaming worker (background knowledge extraction) OR
+ - Loaded directly via `rem db load` for external knowledge bases
+ - OntologyConfig defines which files trigger which extractors
  - Multiple ontologies per file (apply different domain lenses)
- - Tenant-scoped: Each tenant can define their own extractors
+ - Tenant-scoped: Each tenant can define their own extractors and knowledge bases
 
  **Use Cases:**
 
- 1. **Recruitment (CV Parsing)**
-    - Standard pipeline: Chunks for "find me candidates with Python experience"
-    - Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])
- 
- 2. **Legal (Contract Analysis)**
-    - Standard pipeline: Semantic search over contract text
-    - Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])
+ 1. **Recruitment (CV Parsing)** - Agent-extracted
+    - Ontology: Structured fields for filtering/sorting (years_experience, skills[])
 
- 3. **Medical (Health Records)**
-    - Standard pipeline: Find mentions of conditions
-    - Ontology: Structured diagnoses, medications, dosages, treatment plans
+ 2. **Legal (Contract Analysis)** - Agent-extracted
+    - Ontology: Queryable fields (parties, effective_date, payment_amount)
 
- 4. **Finance (Report Analysis)**
-    - Standard pipeline: Search for financial terms
-    - Ontology: Extracted metrics, risk_flags, trends, forecasts
+ 3. **Medical Knowledge Base** - Direct-loaded
+    - Ontology: Disorders, symptoms, medications from curated markdown files
+    - Enables semantic search over psychiatric/medical domain knowledge
 
- **Example Flow:**
- 1. Tenant creates OntologyConfig: "Run cv-parser-v1 on files with mime_type='application/pdf' and tags=['resume']"
- 2. File uploaded with tags=["resume"]
- 3. Normal processing: File → chunks → resources
- 4. Dreaming worker detects matching OntologyConfig
- 5. Loads cv-parser-v1 agent schema from database
- 6. Runs agent on file content → extracts structured data
- 7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
- 8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL
+ 4. **Documentation/Procedures** - Direct-loaded
+    - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
+    - Reference material accessible via RAG
 
  **Design:**
- - Each ontology links to a File via file_id
- - Agent schema tracked via agent_schema_id (human-readable label, not UUID)
- - Structured data in `extracted_data` (arbitrary JSON, schema defined by agent)
- - Embeddings generated for semantic search (configurable fields via agent schema)
- - Multiple ontologies per file using different schemas
+ - `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
+ - `uri` field for external source references (git://, s3://, https://)
+ - Structured data in `extracted_data` (arbitrary JSON)
+ - Embeddings generated for semantic search via `content` field
  - Tenant-isolated: OntologyConfigs are tenant-scoped
  """
 
@@ -70,18 +62,19 @@ from ..core.core_model import CoreModel
 
 
  class Ontology(CoreModel):
-     """Domain-specific knowledge extracted from files using custom agents.
+     """Domain-specific knowledge - either agent-extracted or direct-loaded.
 
      Attributes:
          name: Human-readable label for this ontology instance
-         file_id: Foreign key to File entity that was processed
-         agent_schema_id: Foreign key to Schema entity that performed extraction
-         provider_name: LLM provider used for extraction (e.g., "anthropic", "openai")
-         model_name: Specific model used (e.g., "claude-sonnet-4-5")
-         extracted_data: Structured data extracted by agent (arbitrary JSON)
+         uri: External source reference (git://, s3://, https://) for direct-loaded ontologies
+         file_id: Foreign key to File entity (optional - only for agent-extracted)
+         agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
+         provider_name: LLM provider used for extraction (optional)
+         model_name: Specific model used (optional)
+         extracted_data: Structured data - either extracted by agent or parsed from source
          confidence_score: Optional confidence score from extraction (0.0-1.0)
          extraction_timestamp: When extraction was performed
-         embedding_text: Text used for generating embedding (derived from extracted_data)
+         content: Text used for generating embedding
 
      Inherited from CoreModel:
          id: UUID or string identifier
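
The new uri, content, and extracted_data attributes map naturally onto markdown knowledge-base files with YAML frontmatter. A hedged sketch of that mapping; the frontmatter keys, git URI, and helper function are illustrative, not package code:

    import yaml

    SAMPLE_MD = """---
    type: disorder
    category: anxiety
    icd10: F41.0
    tags: [disorder, anxiety]
    ---
    # Panic Disorder

    Panic disorder is characterized by...
    """

    def markdown_to_ontology_kwargs(name: str, uri: str, text: str) -> dict:
        meta: dict = {}
        body = text
        if text.lstrip().startswith("---"):
            # naive frontmatter split: ---\n<yaml>\n---\n<markdown body>
            _, fm, body = text.split("---", 2)
            meta = yaml.safe_load(fm) or {}
        return {
            "name": name,
            "uri": uri,
            "content": body.strip(),
            "extracted_data": {k: v for k, v in meta.items() if k != "tags"},
            "tags": meta.get("tags", []),
        }

    kwargs = markdown_to_ontology_kwargs(
        "panic-disorder",
        "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
        SAMPLE_MD,
    )
    # Ontology(**kwargs) would then resemble the direct-loaded examples further below.
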
@@ -93,10 +86,9 @@ class Ontology(CoreModel):
          graph_edges: Relationships to other entities
          metadata: Flexible metadata storage
          tags: Classification tags
-         column: Database schema metadata
 
      Example Usage:
-         # CV extraction
+         # Agent-extracted: CV parsing
          cv_ontology = Ontology(
              name="john-doe-cv-2024",
              file_id="file-uuid-123",
@@ -105,73 +97,74 @@ class Ontology(CoreModel):
              model_name="claude-sonnet-4-5-20250929",
              extracted_data={
                  "candidate_name": "John Doe",
-                 "email": "john@example.com",
                  "skills": ["Python", "PostgreSQL", "Kubernetes"],
-                 "experience": [
-                     {
-                         "company": "TechCorp",
-                         "role": "Senior Engineer",
-                         "years": 3,
-                         "achievements": ["Led migration to k8s", "Reduced costs 40%"]
-                     }
-                 ],
-                 "education": [
-                     {"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
-                 ]
              },
              confidence_score=0.95,
-             tags=["cv", "engineering", "senior-level"]
+             tags=["cv", "engineering"]
          )
 
-         # Contract extraction
-         contract_ontology = Ontology(
-             name="acme-supplier-agreement-2024",
-             file_id="file-uuid-456",
-             agent_schema_id="contract-parser-v2",
-             provider_name="openai",
-             model_name="gpt-4.1",
+         # Direct-loaded: Medical knowledge base from git
+         disorder_ontology = Ontology(
+             name="panic-disorder",
+             uri="git://bwolfson-siggie/Siggy-MVP/ontology/disorders/anxiety/panic-disorder.md",
+             content="# Panic Disorder\\n\\nPanic disorder is characterized by...",
              extracted_data={
-                 "contract_type": "supplier_agreement",
-                 "parties": [
-                     {"name": "ACME Corp", "role": "buyer"},
-                     {"name": "SupplyChain Inc", "role": "supplier"}
-                 ],
-                 "effective_date": "2024-01-01",
-                 "termination_date": "2026-12-31",
-                 "payment_terms": {
-                     "amount": 500000,
-                     "currency": "USD",
-                     "frequency": "quarterly"
-                 },
-                 "key_obligations": [
-                     "Supplier must deliver within 30 days",
-                     "Buyer must pay within 60 days of invoice"
-                 ]
+                 "type": "disorder",
+                 "category": "anxiety",
+                 "icd10": "F41.0",
+                 "dsm5_criteria": ["A", "B", "C", "D"],
+             },
+             tags=["disorder", "anxiety", "dsm5"]
+         )
+ 
+         # Direct-loaded: Clinical procedure from git
+         scid_node = Ontology(
+             name="scid-5-f1",
+             uri="git://bwolfson-siggie/Siggy-MVP/ontology/procedures/scid-5/module-f/scid-5-f1.md",
+             content="# scid-5-f1: Panic Attack Screening\\n\\n...",
+             extracted_data={
+                 "type": "procedure",
+                 "module": "F",
+                 "section": "Panic Disorder",
+                 "dsm5_criterion": "Panic Attack Specifier",
              },
-             confidence_score=0.92,
-             tags=["contract", "supplier", "procurement"]
+             tags=["scid-5", "procedure", "anxiety"]
          )
      """
 
      # Core fields
      name: str
-     file_id: UUID | str
-     agent_schema_id: str  # Natural language label of Schema entity
+     uri: Optional[str] = None  # External source: git://, s3://, https://
 
-     # Extraction metadata
-     provider_name: str  # LLM provider (anthropic, openai, etc.)
-     model_name: str  # Specific model used
-     extracted_data: dict[str, Any]  # Arbitrary structured data from agent
+     # Agent extraction fields (optional - only for agent-extracted ontologies)
+     file_id: Optional[UUID | str] = None  # FK to File entity
+     agent_schema_id: Optional[str] = None  # Schema that performed extraction
+     provider_name: Optional[str] = None  # LLM provider (anthropic, openai, etc.)
+     model_name: Optional[str] = None  # Specific model used
+ 
+     # Data fields
+     extracted_data: Optional[dict[str, Any]] = None  # Structured data
      confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
      extraction_timestamp: Optional[str] = None  # ISO8601 timestamp
 
-     # Semantic search support
-     embedding_text: Optional[str] = None  # Text for embedding generation
+     # Semantic search support - 'content' is a default embeddable field name
+     content: Optional[str] = None  # Text for embedding generation
 
      model_config = ConfigDict(
          json_schema_extra={
-             "description": "Domain-specific knowledge extracted from files using custom agents",
+             "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
              "examples": [
+                 {
+                     "name": "panic-disorder",
+                     "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
+                     "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
+                     "extracted_data": {
+                         "type": "disorder",
+                         "category": "anxiety",
+                         "icd10": "F41.0"
+                     },
+                     "tags": ["disorder", "anxiety"]
+                 },
                  {
                      "name": "john-doe-cv-2024",
                      "file_id": "550e8400-e29b-41d4-a716-446655440000",
@@ -180,8 +173,7 @@ class Ontology(CoreModel):
                      "model_name": "claude-sonnet-4-5-20250929",
                      "extracted_data": {
                          "candidate_name": "John Doe",
-                         "skills": ["Python", "PostgreSQL"],
-                         "experience": []
+                         "skills": ["Python", "PostgreSQL"]
                      },
                      "confidence_score": 0.95,
                      "tags": ["cv", "engineering"]
@@ -376,8 +376,17 @@ class EmailService:
                  await user_repo.upsert(existing_user)
                  return {"allowed": True, "error": None}
              else:
-                 # New user - check if domain is trusted
-                 if settings and hasattr(settings, 'email') and settings.email.trusted_domain_list:
+                 # New user - first check if they're a subscriber (by email lookup)
+                 from ...models.entities import Subscriber
+                 subscriber_repo = Repository(Subscriber, db=db)
+                 existing_subscriber = await subscriber_repo.find_one({"email": email})
+ 
+                 if existing_subscriber:
+                     # Subscriber exists - allow them to create account
+                     # (approved field may not exist in older schemas, so just check existence)
+                     logger.info(f"Subscriber {email} creating user account")
+                 elif settings and hasattr(settings, 'email') and settings.email.trusted_domain_list:
+                     # Not an approved subscriber - check if domain is trusted
                      if not settings.email.is_domain_trusted(email):
                          email_domain = email.split("@")[-1]
                          logger.warning(f"Untrusted domain attempted signup: {email_domain}")
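
The new branch changes the signup gate's precedence: a known subscriber now bypasses the trusted-domain check. A stand-alone sketch of that decision order; the boolean and list parameters below are placeholders for the repository lookup and settings calls shown in the diff:

    def signup_allowed(email: str, is_subscriber: bool, trusted_domains: list[str]) -> dict:
        # 1. Known subscribers may create an account regardless of domain.
        if is_subscriber:
            return {"allowed": True, "error": None}
        # 2. Otherwise the email domain must be trusted (when a trusted list is configured).
        if trusted_domains:
            domain = email.split("@")[-1]
            if domain not in trusted_domains:
                return {"allowed": False, "error": f"Untrusted domain: {domain}"}
        return {"allowed": True, "error": None}

    assert signup_allowed("a@example.org", True, ["corp.com"])["allowed"] is True
    assert signup_allowed("b@corp.com", False, ["corp.com"])["allowed"] is True
    assert signup_allowed("c@other.net", False, ["corp.com"])["allowed"] is False
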
@@ -268,7 +268,7 @@ BEGIN
              graph_edges,
              updated_at
          ) VALUES (
-             NEW.{entity_key_field}::VARCHAR,
+             normalize_key(NEW.{entity_key_field}::VARCHAR),
              '{table_name}',
              NEW.id,
              NEW.tenant_id,
rem/settings.py CHANGED
@@ -77,6 +77,7 @@ class LLMSettings(BaseSettings):
      LLM__ANTHROPIC_API_KEY or ANTHROPIC_API_KEY - Anthropic API key
      LLM__EMBEDDING_PROVIDER or EMBEDDING_PROVIDER - Default embedding provider (openai)
      LLM__EMBEDDING_MODEL or EMBEDDING_MODEL - Default embedding model name
+     LLM__DEFAULT_STRUCTURED_OUTPUT - Default structured output mode (False = streaming text)
      """
 
      model_config = SettingsConfigDict(
@@ -138,6 +139,11 @@ class LLMSettings(BaseSettings):
          description="Default embedding model (provider-specific model name)",
      )
 
+     default_structured_output: bool = Field(
+         default=False,
+         description="Default structured output mode for agents. False = streaming text (easier), True = JSON schema validation",
+     )
+ 
      @field_validator("openai_api_key", mode="before")
      @classmethod
      def validate_openai_api_key(cls, v):
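
The new flag follows the same LLM__ environment-variable convention as the surrounding settings. A minimal stand-alone sketch of how pydantic-settings resolves it; this LLMSettings is a local stand-in using env_prefix, not the class from rem/settings.py:

    import os
    from pydantic import Field
    from pydantic_settings import BaseSettings, SettingsConfigDict

    class LLMSettings(BaseSettings):  # stand-in, not the package's LLMSettings
        model_config = SettingsConfigDict(env_prefix="LLM__")
        default_structured_output: bool = Field(default=False)

    os.environ["LLM__DEFAULT_STRUCTURED_OUTPUT"] = "true"
    print(LLMSettings().default_structured_output)  # True
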
@@ -21,6 +21,11 @@ CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_embeddings_moments_vector_hnsw
  ON embeddings_moments
  USING hnsw (embedding vector_cosine_ops);
 
+ -- HNSW vector index for embeddings_ontologies
+ CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_embeddings_ontologies_vector_hnsw
+ ON embeddings_ontologies
+ USING hnsw (embedding vector_cosine_ops);
+ 
  -- HNSW vector index for embeddings_ontology_configs
  CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_embeddings_ontology_configs_vector_hnsw
  ON embeddings_ontology_configs
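
Because the index is built with vector_cosine_ops, queries ordered by pgvector's cosine-distance operator (<=>) can be served by it. A hedged sketch of such a query issued through asyncpg; the DSN, the entity_id column name, and the embedding dimension are assumptions for illustration:

    import asyncio
    import asyncpg

    async def search_ontologies(dsn: str, query_vec: list[float], limit: int = 5):
        conn = await asyncpg.connect(dsn)
        try:
            # pgvector accepts a bracketed text literal cast to ::vector
            vec_literal = "[" + ",".join(str(x) for x in query_vec) + "]"
            return await conn.fetch(
                """
                SELECT entity_id, embedding <=> $1::vector AS cosine_distance
                FROM embeddings_ontologies
                ORDER BY embedding <=> $1::vector
                LIMIT $2
                """,
                vec_literal,
                limit,
            )
        finally:
            await conn.close()

    # asyncio.run(search_ontologies("postgresql://localhost/rem", [0.0] * 1536))
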
@@ -44,6 +44,33 @@ BEGIN
      RAISE NOTICE '✓ All required extensions installed successfully';
  END $$;
 
+ -- ============================================================================
+ -- NORMALIZATION HELPER
+ -- ============================================================================
+ 
+ -- Normalize entity keys to lower-kebab-case for consistent lookups
+ -- "Mood Disorder" -> "mood-disorder"
+ -- "mood_disorder" -> "mood-disorder"
+ -- "MoodDisorder" -> "mood-disorder"
+ CREATE OR REPLACE FUNCTION normalize_key(input TEXT)
+ RETURNS TEXT AS $$
+ BEGIN
+     RETURN lower(
+         regexp_replace(
+             regexp_replace(
+                 regexp_replace(input, '([a-z])([A-Z])', '\1-\2', 'g'),  -- camelCase -> kebab
+                 '[_\s]+', '-', 'g'  -- underscores/spaces -> hyphens
+             ),
+             '-+', '-', 'g'  -- collapse multiple hyphens
+         )
+     );
+ END;
+ $$ LANGUAGE plpgsql IMMUTABLE;
+ 
+ COMMENT ON FUNCTION normalize_key IS
+ 'Normalizes entity keys to lower-kebab-case for consistent lookups.
+ Examples: "Mood Disorder" -> "mood-disorder", "mood_disorder" -> "mood-disorder"';
+ 
  -- ============================================================================
  -- MIGRATION TRACKING
  -- ============================================================================
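
For reference, a Python re-implementation of the same normalization (not shipped in the package), which can be useful when preparing entity keys client-side before a load:

    import re

    def normalize_key(value: str) -> str:
        """Mirror of the SQL normalize_key(): lower-kebab-case keys."""
        value = re.sub(r"([a-z])([A-Z])", r"\1-\2", value)  # camelCase -> camel-Case
        value = re.sub(r"[_\s]+", "-", value)               # underscores/spaces -> hyphens
        value = re.sub(r"-+", "-", value)                   # collapse repeated hyphens
        return value.lower()

    assert normalize_key("Mood Disorder") == "mood-disorder"
    assert normalize_key("mood_disorder") == "mood-disorder"
    assert normalize_key("MoodDisorder") == "mood-disorder"
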
@@ -237,10 +264,11 @@ BEGIN
 
      -- First lookup in KV store to get entity_type (table name)
      -- Include user-owned AND public (NULL user_id) entries
+     -- Normalize input key for consistent matching
      SELECT kv.entity_type INTO entity_table
      FROM kv_store kv
      WHERE (kv.user_id = effective_user_id OR kv.user_id IS NULL)
-       AND kv.entity_key = p_entity_key
+       AND kv.entity_key = normalize_key(p_entity_key)
      LIMIT 1;
 
      -- If not found, return empty
@@ -414,6 +442,7 @@ BEGIN
      FOR graph_keys IN
          WITH RECURSIVE graph_traversal AS (
              -- Base case: Find starting entity (user-owned OR public)
+             -- Normalize input key for consistent matching
              SELECT
                  0 AS depth,
                  kv.entity_key,
@@ -424,7 +453,7 @@ BEGIN
                  ARRAY[kv.entity_key]::TEXT[] AS path
              FROM kv_store kv
              WHERE (kv.user_id = effective_user_id OR kv.user_id IS NULL)
-               AND kv.entity_key = p_entity_key
+               AND kv.entity_key = normalize_key(p_entity_key)
 
              UNION ALL
 
@@ -441,7 +470,7 @@ BEGIN
              JOIN kv_store source_kv ON source_kv.entity_key = gt.entity_key
                  AND (source_kv.user_id = effective_user_id OR source_kv.user_id IS NULL)
              CROSS JOIN LATERAL jsonb_array_elements(COALESCE(source_kv.graph_edges, '[]'::jsonb)) AS edge
-             JOIN kv_store target_kv ON target_kv.entity_key = (edge->>'dst')::VARCHAR(255)
+             JOIN kv_store target_kv ON target_kv.entity_key = normalize_key((edge->>'dst')::VARCHAR(255))
                  AND (target_kv.user_id = effective_user_id OR target_kv.user_id IS NULL)
              WHERE gt.depth < p_max_depth
                  AND (p_rel_type IS NULL OR (edge->>'rel_type')::VARCHAR(100) = p_rel_type)