remdb 0.3.163__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/agentic/agents/agent_manager.py +2 -1
- rem/agentic/context.py +101 -0
- rem/agentic/context_builder.py +30 -8
- rem/agentic/mcp/tool_wrapper.py +43 -14
- rem/agentic/providers/pydantic_ai.py +76 -34
- rem/agentic/schema.py +4 -3
- rem/agentic/tools/rem_tools.py +11 -0
- rem/api/main.py +1 -1
- rem/api/mcp_router/resources.py +75 -14
- rem/api/mcp_router/server.py +31 -24
- rem/api/mcp_router/tools.py +476 -155
- rem/api/routers/auth.py +11 -6
- rem/api/routers/chat/completions.py +52 -10
- rem/api/routers/chat/sse_events.py +2 -2
- rem/api/routers/chat/streaming.py +162 -19
- rem/api/routers/messages.py +96 -23
- rem/auth/middleware.py +59 -42
- rem/cli/README.md +62 -0
- rem/cli/commands/ask.py +1 -1
- rem/cli/commands/db.py +148 -70
- rem/cli/commands/process.py +171 -43
- rem/models/entities/ontology.py +93 -101
- rem/schemas/agents/core/agent-builder.yaml +143 -42
- rem/services/content/service.py +18 -5
- rem/services/email/service.py +17 -6
- rem/services/embeddings/worker.py +26 -12
- rem/services/postgres/__init__.py +28 -3
- rem/services/postgres/diff_service.py +57 -5
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
- rem/services/postgres/register_type.py +12 -11
- rem/services/postgres/repository.py +32 -21
- rem/services/postgres/schema_generator.py +5 -5
- rem/services/postgres/sql_builder.py +6 -5
- rem/services/session/__init__.py +7 -1
- rem/services/session/pydantic_messages.py +210 -0
- rem/services/user_service.py +12 -9
- rem/settings.py +7 -1
- rem/sql/background_indexes.sql +5 -0
- rem/sql/migrations/001_install.sql +148 -11
- rem/sql/migrations/002_install_models.sql +162 -132
- rem/sql/migrations/004_cache_system.sql +7 -275
- rem/utils/model_helpers.py +101 -0
- rem/utils/schema_loader.py +51 -13
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/METADATA +1 -1
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/RECORD +48 -46
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/WHEEL +0 -0
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/entry_points.txt +0 -0
rem/cli/commands/process.py
CHANGED
|
@@ -11,39 +11,102 @@ from rem.services.content import ContentService
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
@click.command(name="ingest")
|
|
14
|
-
@click.argument("
|
|
15
|
-
@click.option("--
|
|
14
|
+
@click.argument("path", type=click.Path(exists=True))
|
|
15
|
+
@click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
|
|
16
|
+
@click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
|
|
17
|
+
@click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
|
|
16
18
|
@click.option("--category", help="Optional file category")
|
|
17
19
|
@click.option("--tags", help="Optional comma-separated tags")
|
|
20
|
+
@click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
|
|
21
|
+
@click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
|
|
18
22
|
def process_ingest(
|
|
19
|
-
|
|
23
|
+
path: str,
|
|
24
|
+
table: str | None,
|
|
25
|
+
make_private: bool,
|
|
20
26
|
user_id: str | None,
|
|
21
27
|
category: str | None,
|
|
22
28
|
tags: str | None,
|
|
29
|
+
pattern: str,
|
|
30
|
+
dry_run: bool,
|
|
23
31
|
):
|
|
24
32
|
"""
|
|
25
|
-
Ingest
|
|
33
|
+
Ingest files into REM (storage + parsing + embedding).
|
|
26
34
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
35
|
+
Supports both single files and directories. For directories, recursively
|
|
36
|
+
processes files matching the pattern (default: **/*.md).
|
|
37
|
+
|
|
38
|
+
**IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
|
|
39
|
+
shared knowledge bases (ontologies, procedures, reference data). Private
|
|
40
|
+
user-scoped data is rarely needed and requires explicit --make-private flag.
|
|
41
|
+
|
|
42
|
+
Target table is auto-detected for schemas (agent.yaml → schemas table).
|
|
43
|
+
Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).
|
|
33
44
|
|
|
34
45
|
Examples:
|
|
35
46
|
rem process ingest sample.pdf
|
|
36
47
|
rem process ingest contract.docx --category legal --tags contract,2023
|
|
37
48
|
rem process ingest agent.yaml # Auto-detects kind=agent, saves to schemas table
|
|
49
|
+
|
|
50
|
+
# Directory ingestion into ontologies table (PUBLIC - no user-id needed)
|
|
51
|
+
rem process ingest ontology/procedures/scid-5/ --table ontologies
|
|
52
|
+
rem process ingest ontology/ --table ontologies --pattern "**/*.md"
|
|
53
|
+
|
|
54
|
+
# Preview what would be ingested
|
|
55
|
+
rem process ingest ontology/ --table ontologies --dry-run
|
|
56
|
+
|
|
57
|
+
# RARE: Private user-scoped data (requires --make-private)
|
|
58
|
+
rem process ingest private-notes.md --make-private --user-id user-123
|
|
38
59
|
"""
|
|
39
60
|
import asyncio
|
|
61
|
+
|
|
62
|
+
# Validate: user_id requires --make-private flag
|
|
63
|
+
if user_id and not make_private:
|
|
64
|
+
raise click.UsageError(
|
|
65
|
+
"Setting --user-id requires the --make-private flag.\n\n"
|
|
66
|
+
"Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
|
|
67
|
+
"is rarely needed - only use --make-private for truly personal content.\n\n"
|
|
68
|
+
"Example: rem process ingest file.md --make-private --user-id user-123"
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
# If --make-private is set, user_id is required
|
|
72
|
+
if make_private and not user_id:
|
|
73
|
+
raise click.UsageError(
|
|
74
|
+
"--make-private requires --user-id to specify which user owns the data.\n\n"
|
|
75
|
+
"Example: rem process ingest file.md --make-private --user-id user-123"
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Defensive normalization: the checks above already reject --user-id without --make-private, so public data always ends up with user_id=None
|
|
79
|
+
effective_user_id = user_id if make_private else None
|
|
80
|
+
from pathlib import Path
|
|
40
81
|
from ...services.content import ContentService
|
|
41
82
|
|
|
42
83
|
async def _ingest():
|
|
43
|
-
# Initialize ContentService with repositories for proper resource saving
|
|
44
84
|
from rem.services.postgres import get_postgres_service
|
|
45
85
|
from rem.services.postgres.repository import Repository
|
|
46
|
-
from rem.models.entities import File, Resource
|
|
86
|
+
from rem.models.entities import File, Resource, Ontology
|
|
87
|
+
|
|
88
|
+
input_path = Path(path)
|
|
89
|
+
tag_list = tags.split(",") if tags else None
|
|
90
|
+
|
|
91
|
+
# Collect files to process
|
|
92
|
+
if input_path.is_dir():
|
|
93
|
+
files_to_process = list(input_path.glob(pattern))
|
|
94
|
+
if not files_to_process:
|
|
95
|
+
logger.error(f"No files matching '{pattern}' found in {input_path}")
|
|
96
|
+
sys.exit(1)
|
|
97
|
+
logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
|
|
98
|
+
else:
|
|
99
|
+
files_to_process = [input_path]
|
|
100
|
+
|
|
101
|
+
# Dry run: just show what would be processed
|
|
102
|
+
if dry_run:
|
|
103
|
+
logger.info("DRY RUN - Would ingest:")
|
|
104
|
+
for f in files_to_process[:20]:
|
|
105
|
+
entity_key = f.stem # filename without extension
|
|
106
|
+
logger.info(f" {f} → {table or 'auto-detect'} (key: {entity_key})")
|
|
107
|
+
if len(files_to_process) > 20:
|
|
108
|
+
logger.info(f" ... and {len(files_to_process) - 20} more files")
|
|
109
|
+
return
|
|
47
110
|
|
|
48
111
|
db = get_postgres_service()
|
|
49
112
|
if not db:
|
|
@@ -51,53 +114,118 @@ def process_ingest(
|
|
|
51
114
|
await db.connect()
|
|
52
115
|
|
|
53
116
|
try:
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
user_id=user_id,
|
|
65
|
-
category=category,
|
|
66
|
-
tags=tag_list,
|
|
67
|
-
is_local_server=True, # CLI is local
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
# Handle schema ingestion (agents/evaluators)
|
|
71
|
-
if result.get("schema_name"):
|
|
72
|
-
logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
|
|
73
|
-
logger.info(f"Version: {result.get('version', '1.0.0')}")
|
|
74
|
-
# Handle file ingestion
|
|
75
|
-
elif result.get("processing_status") == "completed":
|
|
76
|
-
logger.success(f"File ingested: {result['file_name']}")
|
|
77
|
-
logger.info(f"File ID: {result['file_id']}")
|
|
78
|
-
logger.info(f"Resources created: {result['resources_created']}")
|
|
117
|
+
# Direct table ingestion (ontologies, etc.)
|
|
118
|
+
if table:
|
|
119
|
+
await _ingest_to_table(
|
|
120
|
+
db=db,
|
|
121
|
+
files=files_to_process,
|
|
122
|
+
table_name=table,
|
|
123
|
+
user_id=effective_user_id,
|
|
124
|
+
category=category,
|
|
125
|
+
tag_list=tag_list,
|
|
126
|
+
)
|
|
79
127
|
else:
|
|
80
|
-
|
|
81
|
-
|
|
128
|
+
# Standard file ingestion via ContentService
|
|
129
|
+
file_repo = Repository(File, "files", db=db)
|
|
130
|
+
resource_repo = Repository(Resource, "resources", db=db)
|
|
131
|
+
service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
|
|
132
|
+
|
|
133
|
+
for file_path in files_to_process:
|
|
134
|
+
scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
|
|
135
|
+
logger.info(f"Ingesting: {file_path} ({scope_msg})")
|
|
136
|
+
|
|
137
|
+
result = await service.ingest_file(
|
|
138
|
+
file_uri=str(file_path),
|
|
139
|
+
user_id=effective_user_id,
|
|
140
|
+
category=category,
|
|
141
|
+
tags=tag_list,
|
|
142
|
+
is_local_server=True,
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
# Handle schema ingestion (agents/evaluators)
|
|
146
|
+
if result.get("schema_name"):
|
|
147
|
+
logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
|
|
148
|
+
elif result.get("processing_status") == "completed":
|
|
149
|
+
logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
|
|
150
|
+
else:
|
|
151
|
+
logger.error(f"Failed: {result.get('message', 'Unknown error')}")
|
|
82
152
|
|
|
83
153
|
except Exception as e:
|
|
84
154
|
logger.error(f"Error during ingestion: {e}")
|
|
85
155
|
sys.exit(1)
|
|
86
156
|
finally:
|
|
87
|
-
# Wait for
|
|
157
|
+
# Wait for embedding worker to finish
|
|
88
158
|
from rem.services.embeddings.worker import get_global_embedding_worker
|
|
89
159
|
try:
|
|
90
160
|
worker = get_global_embedding_worker()
|
|
91
161
|
if worker and worker.running and not worker.task_queue.empty():
|
|
92
|
-
logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks
|
|
93
|
-
# Worker.stop() waits for queue to drain (see worker.py line ~148)
|
|
162
|
+
logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
|
|
94
163
|
await worker.stop()
|
|
95
164
|
except RuntimeError:
|
|
96
|
-
# Worker doesn't exist yet - no tasks queued
|
|
97
165
|
pass
|
|
98
166
|
|
|
99
167
|
await db.disconnect()
|
|
100
168
|
|
|
169
|
+
async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
|
|
170
|
+
"""Direct ingestion of files to a specific table (ontologies, etc.)."""
|
|
171
|
+
from rem.services.postgres.repository import Repository
|
|
172
|
+
from rem import get_model_registry
|
|
173
|
+
from rem.utils.model_helpers import get_table_name
|
|
174
|
+
|
|
175
|
+
# Get model class for table
|
|
176
|
+
registry = get_model_registry()
|
|
177
|
+
registry.register_core_models()
|
|
178
|
+
model_class = None
|
|
179
|
+
for model in registry.get_model_classes().values():
|
|
180
|
+
if get_table_name(model) == table_name:
|
|
181
|
+
model_class = model
|
|
182
|
+
break
|
|
183
|
+
|
|
184
|
+
if not model_class:
|
|
185
|
+
logger.error(f"Unknown table: {table_name}")
|
|
186
|
+
sys.exit(1)
|
|
187
|
+
|
|
188
|
+
repo = Repository(model_class, table_name, db=db)
|
|
189
|
+
processed = 0
|
|
190
|
+
failed = 0
|
|
191
|
+
|
|
192
|
+
for file_path in files:
|
|
193
|
+
try:
|
|
194
|
+
# Read file content
|
|
195
|
+
content = file_path.read_text(encoding="utf-8")
|
|
196
|
+
entity_key = file_path.stem # filename without extension
|
|
197
|
+
|
|
198
|
+
# Build entity based on table
|
|
199
|
+
entity_data = {
|
|
200
|
+
"name": entity_key,
|
|
201
|
+
"content": content,
|
|
202
|
+
"tags": tag_list or [],
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
# Add optional fields
|
|
206
|
+
if category:
|
|
207
|
+
entity_data["category"] = category
|
|
208
|
+
|
|
209
|
+
# Scoping: user_id for private data, None for public/shared
|
|
210
|
+
# tenant_id=None and user_id=None means PUBLIC data (visible to all)
|
|
211
|
+
entity_data["tenant_id"] = user_id # None = public/shared
|
|
212
|
+
entity_data["user_id"] = user_id # None = public/shared
|
|
213
|
+
|
|
214
|
+
# For ontologies, add URI
|
|
215
|
+
if table_name == "ontologies":
|
|
216
|
+
entity_data["uri"] = f"file://{file_path.absolute()}"
|
|
217
|
+
|
|
218
|
+
entity = model_class(**entity_data)
|
|
219
|
+
await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
|
|
220
|
+
processed += 1
|
|
221
|
+
logger.success(f" ✓ {entity_key}")
|
|
222
|
+
|
|
223
|
+
except Exception as e:
|
|
224
|
+
failed += 1
|
|
225
|
+
logger.error(f" ✗ {file_path.name}: {e}")
|
|
226
|
+
|
|
227
|
+
logger.info(f"Completed: {processed} succeeded, {failed} failed")
|
|
228
|
+
|
|
101
229
|
asyncio.run(_ingest())
|
|
102
230
|
|
|
103
231
|
def register_commands(group: click.Group):
|
rem/models/entities/ontology.py
CHANGED
|
@@ -1,63 +1,55 @@
|
|
|
1
|
-
"""Ontology entity for
|
|
1
|
+
"""Ontology entity for domain-specific knowledge.
|
|
2
2
|
|
|
3
|
-
**What
|
|
3
|
+
**What are Ontologies?**
|
|
4
4
|
|
|
5
|
-
Ontologies are **domain-specific structured knowledge**
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
Ontologies are **domain-specific structured knowledge** that can be:
|
|
6
|
+
1. **Extracted** from files using custom agent schemas (agent-extracted)
|
|
7
|
+
2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
|
|
8
8
|
|
|
9
|
-
**
|
|
10
|
-
File → extract text → chunk → embed → resources (semantic search ready)
|
|
9
|
+
**Use Case 1: Agent-Extracted Ontologies**
|
|
11
10
|
|
|
12
|
-
**Ontology Processing (Tenant Knowledge Extensions):**
|
|
13
11
|
File → custom agent → structured JSON → ontology (domain knowledge)
|
|
14
12
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
13
|
+
Example: A contract PDF becomes a structured record with parties, dates, payment terms.
|
|
14
|
+
|
|
15
|
+
**Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
|
|
16
|
+
|
|
17
|
+
External source (git/S3) → load → ontology (reference knowledge)
|
|
18
|
+
|
|
19
|
+
Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
|
|
20
|
+
files in a git repository. Each markdown file becomes an ontology node with:
|
|
21
|
+
- `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
|
|
22
|
+
- `content`: markdown content for embedding/search
|
|
23
|
+
- `extracted_data`: parsed frontmatter or structure
|
|
20
24
|
|
|
21
25
|
**Architecture:**
|
|
22
|
-
- Runs as part of dreaming worker (background knowledge extraction)
|
|
23
|
-
-
|
|
26
|
+
- Agent-extracted ontologies run as part of the dreaming worker (background knowledge extraction)
|
|
27
|
+
- Loaded directly via `rem db load` for external knowledge bases
|
|
28
|
+
- OntologyConfig defines which files trigger which extractors
|
|
24
29
|
- Multiple ontologies per file (apply different domain lenses)
|
|
25
|
-
- Tenant-scoped: Each tenant can define their own extractors
|
|
30
|
+
- Tenant-scoped: Each tenant can define their own extractors and knowledge bases
|
|
26
31
|
|
|
27
32
|
**Use Cases:**
|
|
28
33
|
|
|
29
|
-
1. **Recruitment (CV Parsing)**
|
|
30
|
-
-
|
|
31
|
-
- Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])
|
|
32
|
-
|
|
33
|
-
2. **Legal (Contract Analysis)**
|
|
34
|
-
- Standard pipeline: Semantic search over contract text
|
|
35
|
-
- Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])
|
|
34
|
+
1. **Recruitment (CV Parsing)** - Agent-extracted
|
|
35
|
+
- Ontology: Structured fields for filtering/sorting (years_experience, skills[])
|
|
36
36
|
|
|
37
|
-
|
|
38
|
-
-
|
|
39
|
-
- Ontology: Structured diagnoses, medications, dosages, treatment plans
|
|
37
|
+
2. **Legal (Contract Analysis)** - Agent-extracted
|
|
38
|
+
- Ontology: Queryable fields (parties, effective_date, payment_amount)
|
|
40
39
|
|
|
41
|
-
|
|
42
|
-
-
|
|
43
|
-
-
|
|
40
|
+
3. **Medical Knowledge Base** - Direct-loaded
|
|
41
|
+
- Ontology: Disorders, symptoms, medications from curated markdown files
|
|
42
|
+
- Enables semantic search over psychiatric/medical domain knowledge
|
|
44
43
|
|
|
45
|
-
**
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
3. Normal processing: File → chunks → resources
|
|
49
|
-
4. Dreaming worker detects matching OntologyConfig
|
|
50
|
-
5. Loads cv-parser-v1 agent schema from database
|
|
51
|
-
6. Runs agent on file content → extracts structured data
|
|
52
|
-
7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
|
|
53
|
-
8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL
|
|
44
|
+
4. **Documentation/Procedures** - Direct-loaded
|
|
45
|
+
- Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
|
|
46
|
+
- Reference material accessible via RAG
|
|
54
47
|
|
|
55
48
|
**Design:**
|
|
56
|
-
-
|
|
57
|
-
-
|
|
58
|
-
- Structured data in `extracted_data` (arbitrary JSON
|
|
59
|
-
- Embeddings generated for semantic search
|
|
60
|
-
- Multiple ontologies per file using different schemas
|
|
49
|
+
- `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
|
|
50
|
+
- `uri` field for external source references (git://, s3://, https://)
|
|
51
|
+
- Structured data in `extracted_data` (arbitrary JSON)
|
|
52
|
+
- Embeddings generated for semantic search via `content` field
|
|
61
53
|
- Tenant-isolated: OntologyConfigs are tenant-scoped
|
|
62
54
|
"""
|
|
63
55
|
|
|
@@ -70,18 +62,19 @@ from ..core.core_model import CoreModel
|
|
|
70
62
|
|
|
71
63
|
|
|
72
64
|
class Ontology(CoreModel):
|
|
73
|
-
"""Domain-specific knowledge
|
|
65
|
+
"""Domain-specific knowledge - either agent-extracted or direct-loaded.
|
|
74
66
|
|
|
75
67
|
Attributes:
|
|
76
68
|
name: Human-readable label for this ontology instance
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
69
|
+
uri: External source reference (git://, s3://, https://, or file:// for files ingested locally via the CLI) for direct-loaded ontologies
|
|
70
|
+
file_id: Foreign key to File entity (optional - only for agent-extracted)
|
|
71
|
+
agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
|
|
72
|
+
provider_name: LLM provider used for extraction (optional)
|
|
73
|
+
model_name: Specific model used (optional)
|
|
74
|
+
extracted_data: Structured data - either extracted by agent or parsed from source
|
|
82
75
|
confidence_score: Optional confidence score from extraction (0.0-1.0)
|
|
83
76
|
extraction_timestamp: When extraction was performed
|
|
84
|
-
|
|
77
|
+
content: Text used to generate the embedding for semantic search
|
|
85
78
|
|
|
86
79
|
Inherited from CoreModel:
|
|
87
80
|
id: UUID or string identifier
|
|
@@ -93,10 +86,9 @@ class Ontology(CoreModel):
|
|
|
93
86
|
graph_edges: Relationships to other entities
|
|
94
87
|
metadata: Flexible metadata storage
|
|
95
88
|
tags: Classification tags
|
|
96
|
-
column: Database schema metadata
|
|
97
89
|
|
|
98
90
|
Example Usage:
|
|
99
|
-
# CV
|
|
91
|
+
# Agent-extracted: CV parsing
|
|
100
92
|
cv_ontology = Ontology(
|
|
101
93
|
name="john-doe-cv-2024",
|
|
102
94
|
file_id="file-uuid-123",
|
|
@@ -105,73 +97,74 @@ class Ontology(CoreModel):
|
|
|
105
97
|
model_name="claude-sonnet-4-5-20250929",
|
|
106
98
|
extracted_data={
|
|
107
99
|
"candidate_name": "John Doe",
|
|
108
|
-
"email": "john@example.com",
|
|
109
100
|
"skills": ["Python", "PostgreSQL", "Kubernetes"],
|
|
110
|
-
"experience": [
|
|
111
|
-
{
|
|
112
|
-
"company": "TechCorp",
|
|
113
|
-
"role": "Senior Engineer",
|
|
114
|
-
"years": 3,
|
|
115
|
-
"achievements": ["Led migration to k8s", "Reduced costs 40%"]
|
|
116
|
-
}
|
|
117
|
-
],
|
|
118
|
-
"education": [
|
|
119
|
-
{"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
|
|
120
|
-
]
|
|
121
101
|
},
|
|
122
102
|
confidence_score=0.95,
|
|
123
|
-
tags=["cv", "engineering"
|
|
103
|
+
tags=["cv", "engineering"]
|
|
124
104
|
)
|
|
125
105
|
|
|
126
|
-
#
|
|
127
|
-
|
|
128
|
-
name="
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
provider_name="openai",
|
|
132
|
-
model_name="gpt-4.1",
|
|
106
|
+
# Direct-loaded: Medical knowledge base from git
|
|
107
|
+
disorder_ontology = Ontology(
|
|
108
|
+
name="panic-disorder",
|
|
109
|
+
uri="git://bwolfson-siggie/Siggy-MVP/ontology/disorders/anxiety/panic-disorder.md",
|
|
110
|
+
content="# Panic Disorder\\n\\nPanic disorder is characterized by...",
|
|
133
111
|
extracted_data={
|
|
134
|
-
"
|
|
135
|
-
"
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
112
|
+
"type": "disorder",
|
|
113
|
+
"category": "anxiety",
|
|
114
|
+
"icd10": "F41.0",
|
|
115
|
+
"dsm5_criteria": ["A", "B", "C", "D"],
|
|
116
|
+
},
|
|
117
|
+
tags=["disorder", "anxiety", "dsm5"]
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
# Direct-loaded: Clinical procedure from git
|
|
121
|
+
scid_node = Ontology(
|
|
122
|
+
name="scid-5-f1",
|
|
123
|
+
uri="git://bwolfson-siggie/Siggy-MVP/ontology/procedures/scid-5/module-f/scid-5-f1.md",
|
|
124
|
+
content="# scid-5-f1: Panic Attack Screening\\n\\n...",
|
|
125
|
+
extracted_data={
|
|
126
|
+
"type": "procedure",
|
|
127
|
+
"module": "F",
|
|
128
|
+
"section": "Panic Disorder",
|
|
129
|
+
"dsm5_criterion": "Panic Attack Specifier",
|
|
150
130
|
},
|
|
151
|
-
|
|
152
|
-
tags=["contract", "supplier", "procurement"]
|
|
131
|
+
tags=["scid-5", "procedure", "anxiety"]
|
|
153
132
|
)
|
|
154
133
|
"""
|
|
155
134
|
|
|
156
135
|
# Core fields
|
|
157
136
|
name: str
|
|
158
|
-
|
|
159
|
-
agent_schema_id: str # Natural language label of Schema entity
|
|
137
|
+
uri: Optional[str] = None # External source: git://, s3://, https://
|
|
160
138
|
|
|
161
|
-
#
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
139
|
+
# Agent extraction fields (optional - only for agent-extracted ontologies)
|
|
140
|
+
file_id: Optional[UUID | str] = None # FK to File entity
|
|
141
|
+
agent_schema_id: Optional[str] = None # Schema that performed extraction
|
|
142
|
+
provider_name: Optional[str] = None # LLM provider (anthropic, openai, etc.)
|
|
143
|
+
model_name: Optional[str] = None # Specific model used
|
|
144
|
+
|
|
145
|
+
# Data fields
|
|
146
|
+
extracted_data: Optional[dict[str, Any]] = None # Structured data
|
|
165
147
|
confidence_score: Optional[float] = None # 0.0-1.0 if provided by agent
|
|
166
148
|
extraction_timestamp: Optional[str] = None # ISO8601 timestamp
|
|
167
149
|
|
|
168
|
-
# Semantic search support
|
|
169
|
-
|
|
150
|
+
# Semantic search support - 'content' is a default embeddable field name
|
|
151
|
+
content: Optional[str] = None # Text for embedding generation
|
|
170
152
|
|
|
171
153
|
model_config = ConfigDict(
|
|
172
154
|
json_schema_extra={
|
|
173
|
-
"description": "Domain-specific knowledge extracted
|
|
155
|
+
"description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
|
|
174
156
|
"examples": [
|
|
157
|
+
{
|
|
158
|
+
"name": "panic-disorder",
|
|
159
|
+
"uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
|
|
160
|
+
"content": "# Panic Disorder\n\nPanic disorder is characterized by...",
|
|
161
|
+
"extracted_data": {
|
|
162
|
+
"type": "disorder",
|
|
163
|
+
"category": "anxiety",
|
|
164
|
+
"icd10": "F41.0"
|
|
165
|
+
},
|
|
166
|
+
"tags": ["disorder", "anxiety"]
|
|
167
|
+
},
|
|
175
168
|
{
|
|
176
169
|
"name": "john-doe-cv-2024",
|
|
177
170
|
"file_id": "550e8400-e29b-41d4-a716-446655440000",
|
|
@@ -180,8 +173,7 @@ class Ontology(CoreModel):
|
|
|
180
173
|
"model_name": "claude-sonnet-4-5-20250929",
|
|
181
174
|
"extracted_data": {
|
|
182
175
|
"candidate_name": "John Doe",
|
|
183
|
-
"skills": ["Python", "PostgreSQL"]
|
|
184
|
-
"experience": []
|
|
176
|
+
"skills": ["Python", "PostgreSQL"]
|
|
185
177
|
},
|
|
186
178
|
"confidence_score": 0.95,
|
|
187
179
|
"tags": ["cv", "engineering"]
|