remdb-0.3.157-py3-none-any.whl → remdb-0.3.180-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/agentic/agents/agent_manager.py +2 -1
- rem/agentic/context.py +81 -3
- rem/agentic/context_builder.py +31 -6
- rem/agentic/mcp/tool_wrapper.py +43 -14
- rem/agentic/providers/pydantic_ai.py +76 -34
- rem/agentic/schema.py +4 -3
- rem/agentic/tools/rem_tools.py +11 -0
- rem/api/deps.py +1 -3
- rem/api/main.py +21 -2
- rem/api/mcp_router/resources.py +75 -14
- rem/api/mcp_router/server.py +27 -24
- rem/api/mcp_router/tools.py +83 -2
- rem/api/middleware/tracking.py +5 -5
- rem/api/routers/auth.py +152 -10
- rem/api/routers/chat/completions.py +5 -3
- rem/api/routers/chat/streaming.py +18 -0
- rem/api/routers/messages.py +24 -15
- rem/auth/jwt.py +352 -0
- rem/auth/middleware.py +70 -30
- rem/cli/commands/ask.py +1 -1
- rem/cli/commands/db.py +98 -44
- rem/models/entities/ontology.py +93 -101
- rem/schemas/agents/core/agent-builder.yaml +143 -42
- rem/services/email/service.py +72 -9
- rem/services/postgres/register_type.py +1 -1
- rem/services/postgres/repository.py +5 -4
- rem/services/user_service.py +41 -9
- rem/settings.py +15 -1
- rem/sql/background_indexes.sql +5 -0
- rem/sql/migrations/001_install.sql +33 -4
- rem/sql/migrations/002_install_models.sql +186 -168
- rem/utils/model_helpers.py +101 -0
- rem/utils/schema_loader.py +45 -7
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/METADATA +1 -1
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/RECORD +37 -36
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/WHEEL +0 -0
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/entry_points.txt +0 -0
rem/cli/commands/db.py
CHANGED
@@ -333,29 +333,46 @@ def rebuild_cache(connection: str | None):
 
 @click.command()
 @click.argument("file_path", type=click.Path(exists=True, path_type=Path))
+@click.option("--table", "-t", default=None, help="Target table name (required for non-YAML formats)")
 @click.option("--user-id", default=None, help="User ID to scope data privately (default: public/shared)")
 @click.option("--dry-run", is_flag=True, help="Show what would be loaded without loading")
-def load(file_path: Path, user_id: str | None, dry_run: bool):
+def load(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
     """
-    Load data from
+    Load data from file into database.
 
-
-
-        key_field: name
-        rows:
-          - name: Example
-            content: Test data...
+    Supports YAML with embedded metadata, or any tabular format via Polars
+    (jsonl, parquet, csv, json, arrow, etc.). For non-YAML formats, use --table.
 
     Examples:
-        rem db load
-        rem db load data.
-        rem db load data.yaml --dry-run
+        rem db load data.yaml                # YAML with metadata
+        rem db load data.jsonl -t resources  # Any Polars-supported format
     """
-    asyncio.run(_load_async(file_path, user_id, dry_run))
+    asyncio.run(_load_async(file_path, table, user_id, dry_run))
 
 
-async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
+def _load_dataframe_from_file(file_path: Path) -> "pl.DataFrame":
+    """Load any Polars-supported file format into a DataFrame."""
+    import polars as pl
+
+    suffix = file_path.suffix.lower()
+
+    if suffix in {".jsonl", ".ndjson"}:
+        return pl.read_ndjson(file_path)
+    elif suffix in {".parquet", ".pq"}:
+        return pl.read_parquet(file_path)
+    elif suffix == ".csv":
+        return pl.read_csv(file_path)
+    elif suffix == ".json":
+        return pl.read_json(file_path)
+    elif suffix in {".ipc", ".arrow"}:
+        return pl.read_ipc(file_path)
+    else:
+        raise ValueError(f"Unsupported file format: {suffix}. Use any Polars-supported format.")
+
+
+async def _load_async(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
     """Async implementation of load command."""
+    import polars as pl
     import yaml
     from ...models.core.inline_edge import InlineEdge
     from ...models.entities import Resource, Moment, User, Message, SharedSession, Schema

@@ -365,21 +382,10 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
     scope_msg = f"user: {user_id}" if user_id else "public"
     logger.info(f"Scope: {scope_msg}")
 
-
-
-        data = yaml.safe_load(f)
-
-    if not isinstance(data, list):
-        logger.error("YAML must be a list of table definitions")
-        raise click.Abort()
-
-    if dry_run:
-        logger.info("DRY RUN - Would load:")
-        logger.info(yaml.dump(data, default_flow_style=False))
-        return
+    suffix = file_path.suffix.lower()
+    is_yaml = suffix in {".yaml", ".yml"}
 
     # Map table names to model classes
-    # CoreModel subclasses use Repository.upsert()
     MODEL_MAP = {
         "users": User,
         "moments": Moment,

@@ -391,6 +397,58 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
     # Non-CoreModel tables that need direct SQL insertion
     DIRECT_INSERT_TABLES = {"shared_sessions"}
 
+    # Parse file based on format
+    if is_yaml:
+        # YAML with embedded metadata
+        with open(file_path) as f:
+            data = yaml.safe_load(f)
+
+        if not isinstance(data, list):
+            logger.error("YAML must be a list of table definitions")
+            raise click.Abort()
+
+        if dry_run:
+            logger.info("DRY RUN - Would load:")
+            logger.info(yaml.dump(data, default_flow_style=False))
+            return
+
+        table_defs = data
+    else:
+        # Polars-supported format - require --table
+        if not table:
+            logger.error(f"For {suffix} files, --table is required. Example: rem db load {file_path.name} -t resources")
+            raise click.Abort()
+
+        try:
+            df = _load_dataframe_from_file(file_path)
+        except Exception as e:
+            logger.error(f"Failed to load file: {e}")
+            raise click.Abort()
+
+        rows = df.to_dicts()
+
+        if dry_run:
+            logger.info(f"DRY RUN - Would load {len(rows)} rows to table '{table}':")
+            logger.info(f"Columns: {list(df.columns)}")
+
+            # Validate first row against model if table is known
+            if table in {"users", "moments", "resources", "messages", "schemas"} and rows:
+                from ...models.entities import Resource, Moment, User, Message, Schema
+                from ...utils.model_helpers import validate_data_for_model
+                model_map = {"users": User, "moments": Moment, "resources": Resource,
+                             "messages": Message, "schemas": Schema}
+                result = validate_data_for_model(model_map[table], rows[0])
+                if result.extra_fields:
+                    logger.warning(f"Unknown fields (ignored): {result.extra_fields}")
+                if result.valid:
+                    logger.success(f"Sample row validates OK. Required: {result.required_fields or '(none)'}")
+                else:
+                    result.log_errors("Sample row")
+            return
+
+        # Wrap as single table definition
+        table_defs = [{"table": table, "rows": rows}]
+
     # Connect to database
     pg = get_postgres_service()
     if not pg:

@@ -402,20 +460,17 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
     try:
         total_loaded = 0
 
-        for table_def in
+        for table_def in table_defs:
             table_name = table_def["table"]
-            key_field = table_def.get("key_field", "id")
             rows = table_def.get("rows", [])
 
             # Handle direct insert tables (non-CoreModel)
             if table_name in DIRECT_INSERT_TABLES:
                 for row_data in rows:
-                    # Add tenant_id if not present
                    if "tenant_id" not in row_data:
                         row_data["tenant_id"] = "default"
 
                     if table_name == "shared_sessions":
-                        # Insert shared_session directly
                         await pg.fetch(
                             """INSERT INTO shared_sessions
                             (session_id, owner_user_id, shared_with_user_id, tenant_id)

@@ -434,12 +489,9 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                 logger.warning(f"Unknown table: {table_name}, skipping")
                 continue
 
-            model_class = MODEL_MAP[table_name]
+            model_class = MODEL_MAP[table_name]
 
-            for row_data in rows:
-                # Add user_id and tenant_id only if explicitly provided
-                # Default is public (None) - data is shared/visible to all
-                # Pass --user-id to scope data privately to a specific user
+            for row_idx, row_data in enumerate(rows):
                 if "user_id" not in row_data and user_id is not None:
                     row_data["user_id"] = user_id
                 if "tenant_id" not in row_data and user_id is not None:

@@ -452,26 +504,28 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                     for edge in row_data["graph_edges"]
                 ]
 
-                # Convert
-                # This handles fields like starts_timestamp, ends_timestamp, etc.
+                # Convert ISO timestamp strings
                 from ...utils.date_utils import parse_iso
                 for key, value in list(row_data.items()):
                     if isinstance(value, str) and (key.endswith("_timestamp") or key.endswith("_at")):
                         try:
                             row_data[key] = parse_iso(value)
                         except (ValueError, TypeError):
-                            pass
+                            pass
 
-                # Create model instance and upsert via repository
                 from ...services.postgres.repository import Repository
+                from ...utils.model_helpers import validate_data_for_model
+
+                result = validate_data_for_model(model_class, row_data)
+                if not result.valid:
+                    result.log_errors(f"Row {row_idx + 1} ({table_name})")
+                    raise click.Abort()
 
-
-                repo
-                await repo.upsert(instance)  # type: ignore[arg-type]
+                repo = Repository(model_class, table_name, pg)
+                await repo.upsert(result.instance)  # type: ignore[arg-type]
                 total_loaded += 1
 
-
-                name = getattr(instance, 'name', getattr(instance, 'id', '?'))
+                name = getattr(result.instance, 'name', getattr(result.instance, 'id', '?'))
                 logger.success(f"Loaded {table_name[:-1]}: {name}")
 
         logger.success(f"Data loaded successfully! Total rows: {total_loaded}")
rem/models/entities/ontology.py
CHANGED
@@ -1,63 +1,55 @@
-"""Ontology entity for
+"""Ontology entity for domain-specific knowledge.
 
-**What
+**What are Ontologies?**
 
-Ontologies are **domain-specific structured knowledge**
-
-
+Ontologies are **domain-specific structured knowledge** that can be:
+1. **Extracted** from files using custom agent schemas (agent-extracted)
+2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
 
-**
-File → extract text → chunk → embed → resources (semantic search ready)
+**Use Case 1: Agent-Extracted Ontologies**
 
-**Ontology Processing (Tenant Knowledge Extensions):**
 File → custom agent → structured JSON → ontology (domain knowledge)
 
-
-
-
-
-
+Example: A contract PDF becomes a structured record with parties, dates, payment terms.
+
+**Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
+
+External source (git/S3) → load → ontology (reference knowledge)
+
+Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
+files in a git repository. Each markdown file becomes an ontology node with:
+- `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
+- `content`: markdown content for embedding/search
+- `extracted_data`: parsed frontmatter or structure
 
 **Architecture:**
-- Runs as part of dreaming worker (background knowledge extraction)
--
+- Runs as part of dreaming worker (background knowledge extraction) OR
+- Loaded directly via `rem db load` for external knowledge bases
+- OntologyConfig defines which files trigger which extractors
 - Multiple ontologies per file (apply different domain lenses)
-- Tenant-scoped: Each tenant can define their own extractors
+- Tenant-scoped: Each tenant can define their own extractors and knowledge bases
 
 **Use Cases:**
 
-1. **Recruitment (CV Parsing)**
-   -
-   - Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])
-
-2. **Legal (Contract Analysis)**
-   - Standard pipeline: Semantic search over contract text
-   - Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])
+1. **Recruitment (CV Parsing)** - Agent-extracted
+   - Ontology: Structured fields for filtering/sorting (years_experience, skills[])
 
-
-   -
-   - Ontology: Structured diagnoses, medications, dosages, treatment plans
+2. **Legal (Contract Analysis)** - Agent-extracted
+   - Ontology: Queryable fields (parties, effective_date, payment_amount)
 
-
-   -
-   -
+3. **Medical Knowledge Base** - Direct-loaded
+   - Ontology: Disorders, symptoms, medications from curated markdown files
+   - Enables semantic search over psychiatric/medical domain knowledge
 
-**
-
-
-3. Normal processing: File → chunks → resources
-4. Dreaming worker detects matching OntologyConfig
-5. Loads cv-parser-v1 agent schema from database
-6. Runs agent on file content → extracts structured data
-7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
-8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL
+4. **Documentation/Procedures** - Direct-loaded
+   - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
+   - Reference material accessible via RAG
 
 **Design:**
--
--
-- Structured data in `extracted_data` (arbitrary JSON
-- Embeddings generated for semantic search
-- Multiple ontologies per file using different schemas
+- `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
+- `uri` field for external source references (git://, s3://, https://)
+- Structured data in `extracted_data` (arbitrary JSON)
+- Embeddings generated for semantic search via `content` field
 - Tenant-isolated: OntologyConfigs are tenant-scoped
 """
 

@@ -70,18 +62,19 @@ from ..core.core_model import CoreModel
 
 
 class Ontology(CoreModel):
-    """Domain-specific knowledge
+    """Domain-specific knowledge - either agent-extracted or direct-loaded.
 
     Attributes:
         name: Human-readable label for this ontology instance
-
-
-
-
-
+        uri: External source reference (git://, s3://, https://) for direct-loaded ontologies
+        file_id: Foreign key to File entity (optional - only for agent-extracted)
+        agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
+        provider_name: LLM provider used for extraction (optional)
+        model_name: Specific model used (optional)
+        extracted_data: Structured data - either extracted by agent or parsed from source
         confidence_score: Optional confidence score from extraction (0.0-1.0)
         extraction_timestamp: When extraction was performed
-
+        content: Text used for generating embedding
 
     Inherited from CoreModel:
         id: UUID or string identifier

@@ -93,10 +86,9 @@ class Ontology(CoreModel):
         graph_edges: Relationships to other entities
         metadata: Flexible metadata storage
         tags: Classification tags
-        column: Database schema metadata
 
     Example Usage:
-        # CV
+        # Agent-extracted: CV parsing
         cv_ontology = Ontology(
             name="john-doe-cv-2024",
             file_id="file-uuid-123",

@@ -105,73 +97,74 @@ class Ontology(CoreModel):
             model_name="claude-sonnet-4-5-20250929",
             extracted_data={
                 "candidate_name": "John Doe",
-                "email": "john@example.com",
                 "skills": ["Python", "PostgreSQL", "Kubernetes"],
-                "experience": [
-                    {
-                        "company": "TechCorp",
-                        "role": "Senior Engineer",
-                        "years": 3,
-                        "achievements": ["Led migration to k8s", "Reduced costs 40%"]
-                    }
-                ],
-                "education": [
-                    {"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
-                ]
             },
             confidence_score=0.95,
-            tags=["cv", "engineering"
+            tags=["cv", "engineering"]
         )
 
-        #
-
-            name="
-
-
-            provider_name="openai",
-            model_name="gpt-4.1",
+        # Direct-loaded: Medical knowledge base from git
+        disorder_ontology = Ontology(
+            name="panic-disorder",
+            uri="git://bwolfson-siggie/Siggy-MVP/ontology/disorders/anxiety/panic-disorder.md",
+            content="# Panic Disorder\\n\\nPanic disorder is characterized by...",
             extracted_data={
-                "
-                "
-
-
-
-
-
-
-
-
-
-
-
-
-
+                "type": "disorder",
+                "category": "anxiety",
+                "icd10": "F41.0",
+                "dsm5_criteria": ["A", "B", "C", "D"],
+            },
+            tags=["disorder", "anxiety", "dsm5"]
+        )
+
+        # Direct-loaded: Clinical procedure from git
+        scid_node = Ontology(
+            name="scid-5-f1",
+            uri="git://bwolfson-siggie/Siggy-MVP/ontology/procedures/scid-5/module-f/scid-5-f1.md",
+            content="# scid-5-f1: Panic Attack Screening\\n\\n...",
+            extracted_data={
+                "type": "procedure",
+                "module": "F",
+                "section": "Panic Disorder",
+                "dsm5_criterion": "Panic Attack Specifier",
             },
-
-            tags=["contract", "supplier", "procurement"]
+            tags=["scid-5", "procedure", "anxiety"]
         )
         """
 
     # Core fields
     name: str
-
-    agent_schema_id: str  # Natural language label of Schema entity
+    uri: Optional[str] = None  # External source: git://, s3://, https://
 
-    #
-
-
-
+    # Agent extraction fields (optional - only for agent-extracted ontologies)
+    file_id: Optional[UUID | str] = None  # FK to File entity
+    agent_schema_id: Optional[str] = None  # Schema that performed extraction
+    provider_name: Optional[str] = None  # LLM provider (anthropic, openai, etc.)
+    model_name: Optional[str] = None  # Specific model used
+
+    # Data fields
+    extracted_data: Optional[dict[str, Any]] = None  # Structured data
     confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
     extraction_timestamp: Optional[str] = None  # ISO8601 timestamp
 
-    # Semantic search support
-
+    # Semantic search support - 'content' is a default embeddable field name
+    content: Optional[str] = None  # Text for embedding generation
 
     model_config = ConfigDict(
         json_schema_extra={
-            "description": "Domain-specific knowledge extracted
+            "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
             "examples": [
+                {
+                    "name": "panic-disorder",
+                    "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
+                    "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
+                    "extracted_data": {
+                        "type": "disorder",
+                        "category": "anxiety",
+                        "icd10": "F41.0"
+                    },
+                    "tags": ["disorder", "anxiety"]
+                },
                 {
                     "name": "john-doe-cv-2024",
                     "file_id": "550e8400-e29b-41d4-a716-446655440000",

@@ -180,8 +173,7 @@ class Ontology(CoreModel):
                 "model_name": "claude-sonnet-4-5-20250929",
                 "extracted_data": {
                     "candidate_name": "John Doe",
-                    "skills": ["Python", "PostgreSQL"]
-                    "experience": []
+                    "skills": ["Python", "PostgreSQL"]
                 },
                 "confidence_score": 0.95,
                 "tags": ["cv", "engineering"]