remdb 0.3.146__py3-none-any.whl → 0.3.181__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic.

Files changed (57)
  1. rem/agentic/agents/__init__.py +16 -0
  2. rem/agentic/agents/agent_manager.py +311 -0
  3. rem/agentic/context.py +81 -3
  4. rem/agentic/context_builder.py +36 -9
  5. rem/agentic/mcp/tool_wrapper.py +43 -14
  6. rem/agentic/providers/pydantic_ai.py +76 -34
  7. rem/agentic/schema.py +4 -3
  8. rem/agentic/tools/rem_tools.py +11 -0
  9. rem/api/deps.py +3 -5
  10. rem/api/main.py +22 -3
  11. rem/api/mcp_router/resources.py +75 -14
  12. rem/api/mcp_router/server.py +28 -23
  13. rem/api/mcp_router/tools.py +177 -2
  14. rem/api/middleware/tracking.py +5 -5
  15. rem/api/routers/auth.py +352 -6
  16. rem/api/routers/chat/completions.py +5 -3
  17. rem/api/routers/chat/streaming.py +95 -22
  18. rem/api/routers/messages.py +24 -15
  19. rem/auth/__init__.py +13 -3
  20. rem/auth/jwt.py +352 -0
  21. rem/auth/middleware.py +70 -30
  22. rem/auth/providers/__init__.py +4 -1
  23. rem/auth/providers/email.py +215 -0
  24. rem/cli/commands/ask.py +1 -1
  25. rem/cli/commands/db.py +118 -54
  26. rem/models/entities/__init__.py +4 -0
  27. rem/models/entities/ontology.py +93 -101
  28. rem/models/entities/subscriber.py +175 -0
  29. rem/models/entities/user.py +1 -0
  30. rem/schemas/agents/core/agent-builder.yaml +235 -0
  31. rem/services/__init__.py +3 -1
  32. rem/services/content/service.py +4 -3
  33. rem/services/email/__init__.py +10 -0
  34. rem/services/email/service.py +522 -0
  35. rem/services/email/templates.py +360 -0
  36. rem/services/embeddings/worker.py +26 -12
  37. rem/services/postgres/README.md +38 -0
  38. rem/services/postgres/diff_service.py +19 -3
  39. rem/services/postgres/pydantic_to_sqlalchemy.py +37 -2
  40. rem/services/postgres/register_type.py +1 -1
  41. rem/services/postgres/repository.py +37 -25
  42. rem/services/postgres/schema_generator.py +5 -5
  43. rem/services/postgres/sql_builder.py +6 -5
  44. rem/services/session/compression.py +113 -50
  45. rem/services/session/reload.py +14 -7
  46. rem/services/user_service.py +41 -9
  47. rem/settings.py +182 -1
  48. rem/sql/background_indexes.sql +5 -0
  49. rem/sql/migrations/001_install.sql +33 -4
  50. rem/sql/migrations/002_install_models.sql +204 -186
  51. rem/sql/migrations/005_schema_update.sql +145 -0
  52. rem/utils/model_helpers.py +101 -0
  53. rem/utils/schema_loader.py +45 -7
  54. {remdb-0.3.146.dist-info → remdb-0.3.181.dist-info}/METADATA +1 -1
  55. {remdb-0.3.146.dist-info → remdb-0.3.181.dist-info}/RECORD +57 -48
  56. {remdb-0.3.146.dist-info → remdb-0.3.181.dist-info}/WHEEL +0 -0
  57. {remdb-0.3.146.dist-info → remdb-0.3.181.dist-info}/entry_points.txt +0 -0
rem/models/entities/ontology.py

@@ -1,63 +1,55 @@
- """Ontology entity for tenant-specific knowledge extensions.
+ """Ontology entity for domain-specific knowledge.

- **What is Ontology Extraction?**
+ **What are Ontologies?**

- Ontologies are **domain-specific structured knowledge** extracted from files using custom
- agent schemas. They extend REM's normal file processing pipeline with tenant-specific
- parsers that extract structured data the standard chunking pipeline would miss.
+ Ontologies are **domain-specific structured knowledge** that can be:
+ 1. **Extracted** from files using custom agent schemas (agent-extracted)
+ 2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)

- **Normal File Processing:**
- File → extract text → chunk → embed → resources (semantic search ready)
+ **Use Case 1: Agent-Extracted Ontologies**

- **Ontology Processing (Tenant Knowledge Extensions):**
  File → custom agent → structured JSON → ontology (domain knowledge)

- **Why Ontologies?**
- - Standard chunking gives you semantic search over raw content
- - Ontologies give you **structured queryable fields** from domain logic
- - Example: A contract PDF becomes both searchable chunks AND a structured record with
-   parties, dates, payment terms, obligations as queryable fields
+ Example: A contract PDF becomes a structured record with parties, dates, payment terms.
+
+ **Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
+
+ External source (git/S3) → load → ontology (reference knowledge)
+
+ Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
+ files in a git repository. Each markdown file becomes an ontology node with:
+ - `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
+ - `content`: markdown content for embedding/search
+ - `extracted_data`: parsed frontmatter or structure

  **Architecture:**
- - Runs as part of dreaming worker (background knowledge extraction)
- - OntologyConfig defines which files trigger which extractors (MIME type, URI pattern, tags)
+ - Runs as part of dreaming worker (background knowledge extraction) OR
+ - Loaded directly via `rem db load` for external knowledge bases
+ - OntologyConfig defines which files trigger which extractors
  - Multiple ontologies per file (apply different domain lenses)
- - Tenant-scoped: Each tenant can define their own extractors
+ - Tenant-scoped: Each tenant can define their own extractors and knowledge bases

  **Use Cases:**

- 1. **Recruitment (CV Parsing)**
-    - Standard pipeline: Chunks for "find me candidates with Python experience"
-    - Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])
-
- 2. **Legal (Contract Analysis)**
-    - Standard pipeline: Semantic search over contract text
-    - Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])
+ 1. **Recruitment (CV Parsing)** - Agent-extracted
+    - Ontology: Structured fields for filtering/sorting (years_experience, skills[])

- 3. **Medical (Health Records)**
-    - Standard pipeline: Find mentions of conditions
-    - Ontology: Structured diagnoses, medications, dosages, treatment plans
+ 2. **Legal (Contract Analysis)** - Agent-extracted
+    - Ontology: Queryable fields (parties, effective_date, payment_amount)

- 4. **Finance (Report Analysis)**
-    - Standard pipeline: Search for financial terms
-    - Ontology: Extracted metrics, risk_flags, trends, forecasts
+ 3. **Medical Knowledge Base** - Direct-loaded
+    - Ontology: Disorders, symptoms, medications from curated markdown files
+    - Enables semantic search over psychiatric/medical domain knowledge

- **Example Flow:**
- 1. Tenant creates OntologyConfig: "Run cv-parser-v1 on files with mime_type='application/pdf' and tags=['resume']"
- 2. File uploaded with tags=["resume"]
- 3. Normal processing: File → chunks → resources
- 4. Dreaming worker detects matching OntologyConfig
- 5. Loads cv-parser-v1 agent schema from database
- 6. Runs agent on file content → extracts structured data
- 7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
- 8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL
+ 4. **Documentation/Procedures** - Direct-loaded
+    - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
+    - Reference material accessible via RAG

  **Design:**
- - Each ontology links to a File via file_id
- - Agent schema tracked via agent_schema_id (human-readable label, not UUID)
- - Structured data in `extracted_data` (arbitrary JSON, schema defined by agent)
- - Embeddings generated for semantic search (configurable fields via agent schema)
- - Multiple ontologies per file using different schemas
+ - `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
+ - `uri` field for external source references (git://, s3://, https://)
+ - Structured data in `extracted_data` (arbitrary JSON)
+ - Embeddings generated for semantic search via `content` field
  - Tenant-isolated: OntologyConfigs are tenant-scoped
  """

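The docstring above describes direct-loaded ontologies only in prose, so here is a minimal sketch of how a single markdown file from a checked-out knowledge-base repo could be mapped onto the new fields (`name`, `uri`, `content`, `extracted_data`). The helper name, the naive frontmatter split, and the repo layout are illustrative assumptions; the actual ingestion path in remdb is the `rem db load` command mentioned in the docstring.

    # Sketch only: maps one markdown file to Ontology-style kwargs. The helper name,
    # frontmatter handling, and repo layout are assumptions, not remdb's loader.
    from pathlib import Path

    def markdown_to_ontology_kwargs(repo: str, md_path: Path) -> dict:
        """Build the field values a direct-loaded ontology node would carry."""
        text = md_path.read_text(encoding="utf-8")
        extracted: dict = {}
        body = text
        # Naive "---" frontmatter split (assumed format); real files may differ.
        if text.startswith("---") and text.count("---") >= 2:
            _, front, body = text.split("---", 2)
            for line in front.strip().splitlines():
                if ":" in line:
                    key, value = line.split(":", 1)
                    extracted[key.strip()] = value.strip()
        return {
            "name": md_path.stem,                         # e.g. "panic-disorder"
            "uri": f"git://{repo}/{md_path.as_posix()}",  # external source reference
            "content": body.strip(),                      # text used for embeddings
            "extracted_data": extracted or None,          # parsed frontmatter, if any
        }

    # e.g. Ontology(**markdown_to_ontology_kwargs("org/repo", Path("ontology/disorders/anxiety/panic-disorder.md")))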
@@ -70,18 +62,19 @@ from ..core.core_model import CoreModel


  class Ontology(CoreModel):
-     """Domain-specific knowledge extracted from files using custom agents.
+     """Domain-specific knowledge - either agent-extracted or direct-loaded.

      Attributes:
          name: Human-readable label for this ontology instance
-         file_id: Foreign key to File entity that was processed
-         agent_schema_id: Foreign key to Schema entity that performed extraction
-         provider_name: LLM provider used for extraction (e.g., "anthropic", "openai")
-         model_name: Specific model used (e.g., "claude-sonnet-4-5")
-         extracted_data: Structured data extracted by agent (arbitrary JSON)
+         uri: External source reference (git://, s3://, https://) for direct-loaded ontologies
+         file_id: Foreign key to File entity (optional - only for agent-extracted)
+         agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
+         provider_name: LLM provider used for extraction (optional)
+         model_name: Specific model used (optional)
+         extracted_data: Structured data - either extracted by agent or parsed from source
          confidence_score: Optional confidence score from extraction (0.0-1.0)
          extraction_timestamp: When extraction was performed
-         embedding_text: Text used for generating embedding (derived from extracted_data)
+         content: Text used for generating embedding

      Inherited from CoreModel:
          id: UUID or string identifier
@@ -93,10 +86,9 @@ class Ontology(CoreModel):
          graph_edges: Relationships to other entities
          metadata: Flexible metadata storage
          tags: Classification tags
-         column: Database schema metadata

      Example Usage:
-         # CV extraction
+         # Agent-extracted: CV parsing
          cv_ontology = Ontology(
              name="john-doe-cv-2024",
              file_id="file-uuid-123",
@@ -105,73 +97,74 @@ class Ontology(CoreModel):
              model_name="claude-sonnet-4-5-20250929",
              extracted_data={
                  "candidate_name": "John Doe",
-                 "email": "john@example.com",
                  "skills": ["Python", "PostgreSQL", "Kubernetes"],
-                 "experience": [
-                     {
-                         "company": "TechCorp",
-                         "role": "Senior Engineer",
-                         "years": 3,
-                         "achievements": ["Led migration to k8s", "Reduced costs 40%"]
-                     }
-                 ],
-                 "education": [
-                     {"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
-                 ]
              },
              confidence_score=0.95,
-             tags=["cv", "engineering", "senior-level"]
+             tags=["cv", "engineering"]
          )

-         # Contract extraction
-         contract_ontology = Ontology(
-             name="acme-supplier-agreement-2024",
-             file_id="file-uuid-456",
-             agent_schema_id="contract-parser-v2",
-             provider_name="openai",
-             model_name="gpt-4.1",
+         # Direct-loaded: Medical knowledge base from git
+         disorder_ontology = Ontology(
+             name="panic-disorder",
+             uri="git://bwolfson-siggie/Siggy-MVP/ontology/disorders/anxiety/panic-disorder.md",
+             content="# Panic Disorder\n\nPanic disorder is characterized by...",
              extracted_data={
-                 "contract_type": "supplier_agreement",
-                 "parties": [
-                     {"name": "ACME Corp", "role": "buyer"},
-                     {"name": "SupplyChain Inc", "role": "supplier"}
-                 ],
-                 "effective_date": "2024-01-01",
-                 "termination_date": "2026-12-31",
-                 "payment_terms": {
-                     "amount": 500000,
-                     "currency": "USD",
-                     "frequency": "quarterly"
-                 },
-                 "key_obligations": [
-                     "Supplier must deliver within 30 days",
-                     "Buyer must pay within 60 days of invoice"
-                 ]
+                 "type": "disorder",
+                 "category": "anxiety",
+                 "icd10": "F41.0",
+                 "dsm5_criteria": ["A", "B", "C", "D"],
+             },
+             tags=["disorder", "anxiety", "dsm5"]
+         )
+
+         # Direct-loaded: Clinical procedure from git
+         scid_node = Ontology(
+             name="scid-5-f1",
+             uri="git://bwolfson-siggie/Siggy-MVP/ontology/procedures/scid-5/module-f/scid-5-f1.md",
+             content="# scid-5-f1: Panic Attack Screening\n\n...",
+             extracted_data={
+                 "type": "procedure",
+                 "module": "F",
+                 "section": "Panic Disorder",
+                 "dsm5_criterion": "Panic Attack Specifier",
              },
-             confidence_score=0.92,
-             tags=["contract", "supplier", "procurement"]
+             tags=["scid-5", "procedure", "anxiety"]
          )
      """

      # Core fields
      name: str
-     file_id: UUID | str
-     agent_schema_id: str  # Natural language label of Schema entity
+     uri: Optional[str] = None  # External source: git://, s3://, https://

-     # Extraction metadata
-     provider_name: str  # LLM provider (anthropic, openai, etc.)
-     model_name: str  # Specific model used
-     extracted_data: dict[str, Any]  # Arbitrary structured data from agent
+     # Agent extraction fields (optional - only for agent-extracted ontologies)
+     file_id: Optional[UUID | str] = None  # FK to File entity
+     agent_schema_id: Optional[str] = None  # Schema that performed extraction
+     provider_name: Optional[str] = None  # LLM provider (anthropic, openai, etc.)
+     model_name: Optional[str] = None  # Specific model used
+
+     # Data fields
+     extracted_data: Optional[dict[str, Any]] = None  # Structured data
      confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
      extraction_timestamp: Optional[str] = None  # ISO8601 timestamp

-     # Semantic search support
-     embedding_text: Optional[str] = None  # Text for embedding generation
+     # Semantic search support - 'content' is a default embeddable field name
+     content: Optional[str] = None  # Text for embedding generation

      model_config = ConfigDict(
          json_schema_extra={
-             "description": "Domain-specific knowledge extracted from files using custom agents",
+             "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
              "examples": [
+                 {
+                     "name": "panic-disorder",
+                     "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
+                     "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
+                     "extracted_data": {
+                         "type": "disorder",
+                         "category": "anxiety",
+                         "icd10": "F41.0"
+                     },
+                     "tags": ["disorder", "anxiety"]
+                 },
                  {
                      "name": "john-doe-cv-2024",
                      "file_id": "550e8400-e29b-41d4-a716-446655440000",
@@ -180,8 +173,7 @@ class Ontology(CoreModel):
                      "model_name": "claude-sonnet-4-5-20250929",
                      "extracted_data": {
                          "candidate_name": "John Doe",
-                         "skills": ["Python", "PostgreSQL"],
-                         "experience": []
+                         "skills": ["Python", "PostgreSQL"]
                      },
                      "confidence_score": 0.95,
                      "tags": ["cv", "engineering"]
rem/models/entities/subscriber.py (new file)

@@ -0,0 +1,175 @@
+ """
+ Subscriber - Email subscription management.
+
+ This model stores subscribers who sign up via websites/apps.
+ Subscribers can be collected before user registration for newsletters,
+ updates, and approval-based access control.
+
+ Key features:
+ - Deterministic UUID from email (same email = same ID)
+ - Approval workflow for access control
+ - Tags for segmentation
+ - Origin tracking for analytics
+ """
+
+ import uuid
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Optional
+
+ from pydantic import Field, EmailStr, model_validator
+
+ from ..core import CoreModel
+
+
+ class SubscriberStatus(str, Enum):
+     """Subscription status."""
+
+     ACTIVE = "active"              # Actively subscribed
+     UNSUBSCRIBED = "unsubscribed"  # User unsubscribed
+     BOUNCED = "bounced"            # Email bounced
+     PENDING = "pending"            # Pending confirmation (if double opt-in)
+
+
+ class SubscriberOrigin(str, Enum):
+     """Where the subscription originated from."""
+
+     WEBSITE = "website"            # Main website subscribe form
+     LANDING_PAGE = "landing_page"  # Campaign landing page
+     APP = "app"                    # In-app subscription
+     IMPORT = "import"              # Bulk import
+     REFERRAL = "referral"          # Referred by another user
+     OTHER = "other"
+
+
+ class Subscriber(CoreModel):
+     """
+     Email subscriber for newsletters and access control.
+
+     This model captures subscribers who sign up via the website, landing pages,
+     or in-app prompts. Uses deterministic UUID from email for natural upserts.
+
+     Access control via `approved` field:
+     - When email auth checks subscriber status, only approved subscribers
+       can complete login (if approval is enabled in settings).
+     - Subscribers can be pre-approved, or approved manually/automatically.
+
+     Usage:
+         from rem.services.postgres import Repository
+         from rem.models.entities import Subscriber, SubscriberStatus
+
+         repo = Repository(Subscriber, db=db)
+
+         # Create subscriber (ID auto-generated from email)
+         subscriber = Subscriber(
+             email="user@example.com",
+             name="John Doe",
+             origin=SubscriberOrigin.WEBSITE,
+         )
+         await repo.upsert(subscriber)
+
+         # Check if approved for login
+         subscriber = await repo.get_by_id(subscriber.id, tenant_id="default")
+         if subscriber and subscriber.approved:
+             # Allow login
+             pass
+     """
+
+     # Required field
+     email: EmailStr = Field(
+         description="Subscriber's email address (unique identifier)"
+     )
+
+     # Optional fields
+     name: Optional[str] = Field(
+         default=None,
+         description="Subscriber's name (optional)"
+     )
+
+     comment: Optional[str] = Field(
+         default=None,
+         max_length=500,
+         description="Optional comment or message from subscriber"
+     )
+
+     status: SubscriberStatus = Field(
+         default=SubscriberStatus.ACTIVE,
+         description="Current subscription status"
+     )
+
+     # Access control
+     approved: bool = Field(
+         default=False,
+         description="Whether subscriber is approved for login (for approval workflows)"
+     )
+
+     approved_at: Optional[datetime] = Field(
+         default=None,
+         description="When the subscriber was approved"
+     )
+
+     approved_by: Optional[str] = Field(
+         default=None,
+         description="Who approved the subscriber (user ID or 'system')"
+     )
+
+     # Origin tracking
+     origin: SubscriberOrigin = Field(
+         default=SubscriberOrigin.WEBSITE,
+         description="Where the subscription originated"
+     )
+
+     origin_detail: Optional[str] = Field(
+         default=None,
+         description="Additional origin context (e.g., campaign name, page URL)"
+     )
+
+     # Timestamps
+     subscribed_at: datetime = Field(
+         default_factory=lambda: datetime.now(timezone.utc),
+         description="When the subscription was created"
+     )
+
+     unsubscribed_at: Optional[datetime] = Field(
+         default=None,
+         description="When the user unsubscribed (if applicable)"
+     )
+
+     # Compliance
+     ip_address: Optional[str] = Field(
+         default=None,
+         description="IP address at subscription time (for compliance)"
+     )
+
+     user_agent: Optional[str] = Field(
+         default=None,
+         description="Browser user agent at subscription time"
+     )
+
+     # Segmentation
+     tags: list[str] = Field(
+         default_factory=list,
+         description="Tags for segmentation (e.g., ['early-access', 'beta'])"
+     )
+
+     @staticmethod
+     def email_to_uuid(email: str) -> uuid.UUID:
+         """Generate a deterministic UUID from an email address.
+
+         Uses UUID v5 with DNS namespace for consistency with
+         EmailService.generate_user_id_from_email().
+
+         Args:
+             email: Email address
+
+         Returns:
+             Deterministic UUID
+         """
+         return uuid.uuid5(uuid.NAMESPACE_DNS, email.lower().strip())
+
+     @model_validator(mode="after")
+     def set_id_from_email(self) -> "Subscriber":
+         """Auto-generate deterministic ID from email for natural upsert."""
+         if self.email:
+             self.id = self.email_to_uuid(self.email)
+         return self
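The deterministic-ID design is the load-bearing piece of this new model: because the ID is a UUIDv5 of the normalised email, re-submitting the same address always targets the same row, which is what makes the documented `repo.upsert()` behave as a natural dedupe. A quick self-contained check of that property (only the `uuid.uuid5` recipe comes from the code above; the rest is illustration):

    import uuid

    def email_to_uuid(email: str) -> uuid.UUID:
        # Same recipe as Subscriber.email_to_uuid above: UUIDv5 over the
        # lower-cased, stripped address in the DNS namespace.
        return uuid.uuid5(uuid.NAMESPACE_DNS, email.lower().strip())

    a = email_to_uuid("  User@Example.com ")
    b = email_to_uuid("user@example.com")
    assert a == b  # case/whitespace variants collapse to one subscriber ID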
rem/models/entities/user.py

@@ -22,6 +22,7 @@ from ..core import CoreModel
  class UserTier(str, Enum):
      """User subscription tier for feature gating."""

+     BLOCKED = "blocked"  # User is blocked from logging in
      ANONYMOUS = "anonymous"
      FREE = "free"
      BASIC = "basic"
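The diff adds a BLOCKED tier but does not show where it is enforced (the auth changes elsewhere in this release are not reproduced here), so the following is a hypothetical sketch of the kind of login gate such a tier enables; the function name, the gate's placement, and the exception choice are assumptions, not remdb's code.

    # Hypothetical gate using the new tier; names and exception type are assumptions.
    from enum import Enum

    class UserTier(str, Enum):
        BLOCKED = "blocked"      # User is blocked from logging in
        ANONYMOUS = "anonymous"
        FREE = "free"
        BASIC = "basic"

    def assert_login_allowed(tier: UserTier) -> None:
        """Refuse to mint a session/JWT for blocked accounts."""
        if tier is UserTier.BLOCKED:
            raise PermissionError("account is blocked from logging in")

    assert_login_allowed(UserTier.FREE)       # passes
    # assert_login_allowed(UserTier.BLOCKED)  # would raise PermissionError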