remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,88 @@
1
+ """
2
+ ImageResource - Image-specific resource with CLIP embeddings.
3
+
4
+ ImageResources are a specialized subclass of Resource for images,
5
+ with support for CLIP embeddings and vision LLM descriptions.
6
+
7
+ Key differences from base Resource:
8
+ - **Separate table**: Stored in `image_resources` table, not `resources`
9
+ - **Different embeddings**: Uses CLIP embeddings (multimodal) instead of text embeddings
10
+ - **Embedding provider override**: Must use CLIP-compatible provider (Jina AI, self-hosted)
11
+ - **Vision descriptions**: Optional vision LLM descriptions (tier/sampling gated)
12
+ - **Image metadata**: Dimensions, format, and other image-specific fields
13
+
14
+ Why separate table?
15
+ 1. Different embedding dimensionality (512/768 vs 1536)
16
+ 2. Different embedding model (CLIP vs text-embedding-3-small)
17
+ 3. Multimodal search capabilities (text-to-image, image-to-image)
18
+ 4. Image-specific indexes and queries
19
+ 5. Cost tracking (CLIP tokens vs text tokens)
20
+
21
+ Usage:
22
+ - ImageProvider saves to ImageResource table with CLIP embeddings
23
+ - Regular text Resources use standard text embeddings
24
+ - Cross-modal search: text queries can search ImageResources via CLIP
25
+ """
26
+
27
+ from typing import Optional
28
+
29
+ from pydantic import Field
30
+
31
+ from .resource import Resource
32
+
33
+
34
class ImageResource(Resource):
    """
    A ``Resource`` specialised for images.

    On top of everything inherited from ``Resource``, this model carries
    image metadata, an optional vision-LLM description, and an optional CLIP
    embedding. Every field declared here is optional and defaults to ``None``.

    Design notes (intent of the surrounding system; nothing in this class
    enforces them):

    Storage:
    - Kept in its own `image_resources` table rather than `resources`, so
      CLIP vectors (512/768 dimensions) are not mixed with text vectors
      (1536 dimensions).
    - The separation supports multimodal search (text-to-image,
      image-to-image) and separate cost tracking for CLIP vs text tokens.

    Embedding strategy:
    - Default (when JINA_API_KEY set): Jina CLIP API (jina-clip-v2)
    - Future: self-hosted OpenCLIP models via KEDA-scaled pods
    - Fallback: no embeddings (images searchable by metadata only)

    Vision LLM strategy (tier/sampling gated):
    - Gold tier: always get vision descriptions
    - Silver/Free: probabilistic sampling (IMAGE_VLLM_SAMPLE_RATE)
    - Fallback: basic metadata only

    Tenant isolation is provided via the ``CoreModel.tenant_id`` field.
    """

    # --- Basic image metadata -------------------------------------------
    image_width: Optional[int] = Field(None, description="Image width in pixels")
    image_height: Optional[int] = Field(None, description="Image height in pixels")
    image_format: Optional[str] = Field(
        None, description="Image format (PNG, JPEG, GIF, WebP)"
    )

    # --- Vision LLM output (absent when no description was generated) ----
    vision_description: Optional[str] = Field(
        None,
        description="Vision LLM generated description (markdown, only for gold tier or sampled images)",
    )
    vision_provider: Optional[str] = Field(
        None, description="Vision provider used (anthropic, gemini, openai)"
    )
    vision_model: Optional[str] = Field(
        None, description="Vision model used for description"
    )

    # --- CLIP embedding (absent when no embedding was computed) ----------
    clip_embedding: Optional[list[float]] = Field(
        None,
        description="CLIP embedding vector (512 or 768 dimensions, from Jina AI or self-hosted)",
    )
    clip_dimensions: Optional[int] = Field(
        None,
        description="CLIP embedding dimensionality (512 for jina-clip-v2, 768 for jina-clip-v1)",
    )
@@ -0,0 +1,64 @@
1
+ """
2
+ Message - Communication content in REM.
3
+
4
+ Messages represent individual communication units (chat messages, emails, etc.)
5
+ that can be grouped into conversations or moments.
6
+
7
+ Messages are simpler than Resources but share the same graph connectivity
8
+ through CoreModel inheritance.
9
+
10
+ Trace Integration:
11
+ - trace_id: OTEL trace ID for linking to observability
12
+ - span_id: OTEL span ID for specific span reference
13
+ - These enable feedback to be attached to Phoenix annotations
14
+ """
15
+
16
+ from pydantic import Field
17
+
18
+ from ..core import CoreModel
19
+
20
+
21
class Message(CoreModel):
    """
    One unit of communication, such as a single chat message.

    Only ``content`` is required; every other field declared here is optional
    and defaults to ``None``. Tenant isolation is provided via the
    ``CoreModel.tenant_id`` field.

    The ``trace_id`` / ``span_id`` pair links a message to its OTEL trace so
    that observability tooling (Phoenix) can attach feedback annotations to
    the right span.
    """

    # The message text itself (required).
    content: str = Field(..., description="Message content text")

    # Conversation context.
    message_type: str | None = Field(
        None,
        description="Message type e.g. role: 'user', 'assistant', 'system', 'tool'",
    )
    session_id: str | None = Field(
        None, description="Session identifier for tracking message context"
    )

    # Generation details.
    prompt: str | None = Field(
        None,
        description="Custom prompt used for this message (if overridden from default)",
    )
    model: str | None = Field(
        None,
        description="Model used for generating this message (provider:model format)",
    )
    token_count: int | None = Field(None, description="Token count for this message")

    # OTEL/Phoenix trace integration.
    trace_id: str | None = Field(
        None, description="OTEL trace ID for observability integration"
    )
    span_id: str | None = Field(
        None, description="OTEL span ID for specific span reference"
    )
@@ -0,0 +1,123 @@
1
+ """
2
+ Moment - Temporal narrative in REM.
3
+
4
+ Moments are extracted from Resources through first-order dreaming workflows.
5
+ They represent temporal narratives like meetings, coding sessions, conversations,
6
+ or any classified time period when users were focused on specific activities.
7
+
8
+ Moments provide temporal structure to the REM graph:
9
+ - Temporal boundaries (starts_timestamp, ends_timestamp)
10
+ - Present persons (who was involved)
11
+ - Emotion tags (team sentiment)
12
+ - Topic tags (what was discussed)
13
+ - Natural language summaries
14
+
15
+ Moments enable temporal queries:
16
+ - "What happened between milestone A and B?"
17
+ - "When did Sarah and Mike meet?"
18
+ - "What was discussed in Q4 retrospective?"
19
+
20
+ Data Model:
21
+ - Inherits from CoreModel (id, tenant_id, timestamps, graph_edges, etc.)
22
+ - name: Human-readable moment name
23
+ - moment_type: Classification (meeting, coding-session, conversation, etc.)
24
+ - starts_timestamp: Start time
25
+ - ends_timestamp: End time
26
+ - present_persons: List of Person objects with id, name, role
27
+ - emotion_tags: Sentiment tags (happy, frustrated, focused)
28
+ - topic_tags: Topic/concept tags (project names, technologies)
29
+ - summary: Natural language description
30
+ - source_resource_ids: Resources used to construct this moment
31
+ """
32
+
33
+ from datetime import datetime
34
+ from typing import Optional
35
+
36
+ from pydantic import BaseModel, Field, model_validator
37
+
38
+ from ..core import CoreModel
39
+
40
+
41
class Person(BaseModel):
    """Lightweight reference to a person who took part in a moment."""

    # Both identifiers are required; the role is optional.
    id: str = Field(
        ...,
        description="Person entity label",
    )
    name: str = Field(
        ...,
        description="Person name",
    )
    role: Optional[str] = Field(
        None,
        description="Person role in moment",
    )
47
+
48
+
49
+
50
class Moment(CoreModel):
    """
    A time-bounded narrative (meeting, coding session, conversation, ...).

    Moments give the REM graph its temporal structure: when something
    happened, who was present, and which topics and emotions were involved.
    Only ``starts_timestamp`` is required. Tenant isolation is provided via
    the ``CoreModel.tenant_id`` field.

    When ``name`` is missing or empty, it is filled in after validation by
    ``generate_name_if_missing``.
    """

    # --- Identity and classification -------------------------------------
    name: Optional[str] = Field(
        None,
        description="Human-readable moment name (used as graph label). Auto-generated from starts_timestamp+moment_type if not provided.",
        # Marks this field as the primary business key for KV lookups.
        json_schema_extra={"entity_key": True},
    )
    moment_type: Optional[str] = Field(
        None,
        description="Moment classification (meeting, coding-session, conversation, etc.)",
    )
    category: Optional[str] = Field(
        None, description="Moment category for grouping and filtering"
    )

    # --- Temporal boundaries ---------------------------------------------
    starts_timestamp: datetime = Field(..., description="Moment start time")
    ends_timestamp: Optional[datetime] = Field(None, description="Moment end time")

    # --- Participants and tags -------------------------------------------
    present_persons: list[Person] = Field(
        default_factory=list, description="People present in the moment"
    )
    emotion_tags: list[str] = Field(
        default_factory=list,
        description="Emotion/sentiment tags (happy, frustrated, focused, etc.)",
    )
    topic_tags: list[str] = Field(
        default_factory=list,
        description="Topic/concept tags (project names, technologies, etc.)",
    )

    # --- Narrative and provenance ----------------------------------------
    summary: Optional[str] = Field(
        None, description="Natural language summary of the moment"
    )
    source_resource_ids: list[str] = Field(
        default_factory=list,
        description="Resource IDs used to construct this moment",
    )

    @model_validator(mode='after')
    def generate_name_if_missing(self) -> 'Moment':
        """
        Fill in ``name`` when the caller left it empty.

        Naming rules, in priority order:
        - start time and moment type: "<Moment Type> on YYYY-MM-DD", with
          dashes and underscores in the type turned into spaces and the
          result title-cased (e.g. "Coding Session on 2024-12-20")
        - start time only: "Moment on YYYY-MM-DD HH:MM"
        - no start time but an id: "moment-" plus the first 8 characters
          of the id
        - otherwise: "unnamed-moment"

        Returns:
            The same instance, so pydantic can continue validation.
        """
        # An explicit, non-empty name always wins.
        if self.name:
            return self

        started = self.starts_timestamp
        if started:
            day = started.strftime("%Y-%m-%d")
            clock = started.strftime("%H:%M")
            if self.moment_type:
                pretty_type = self.moment_type.replace('-', ' ').replace('_', ' ').title()
                self.name = f"{pretty_type} on {day}"
            else:
                # Without a type, the time of day is included in the name.
                self.name = f"Moment on {day} {clock}"
        elif self.id:
            self.name = f"moment-{str(self.id)[:8]}"
        else:
            self.name = "unnamed-moment"

        return self
@@ -0,0 +1,181 @@
1
+ """Ontology entity for domain-specific knowledge.
2
+
3
+ **What are Ontologies?**
4
+
5
+ Ontologies are **domain-specific structured knowledge** that can be:
6
+ 1. **Extracted** from files using custom agent schemas (agent-extracted)
7
+ 2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
8
+
9
+ **Use Case 1: Agent-Extracted Ontologies**
10
+
11
+ File → custom agent → structured JSON → ontology (domain knowledge)
12
+
13
+ Example: A contract PDF becomes a structured record with parties, dates, payment terms.
14
+
15
+ **Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
16
+
17
+ External source (git/S3) → load → ontology (reference knowledge)
18
+
19
+ Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
20
+ files in a git repository. Each markdown file becomes an ontology node with:
21
+ - `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
22
+ - `content`: markdown content for embedding/search
23
+ - `extracted_data`: parsed frontmatter or structure
24
+
25
+ **Architecture:**
26
+ - Runs as part of dreaming worker (background knowledge extraction) OR
27
+ - Loaded directly via `rem db load` for external knowledge bases
28
+ - OntologyConfig defines which files trigger which extractors
29
+ - Multiple ontologies per file (apply different domain lenses)
30
+ - Tenant-scoped: Each tenant can define their own extractors and knowledge bases
31
+
32
+ **Use Cases:**
33
+
34
+ 1. **Recruitment (CV Parsing)** - Agent-extracted
35
+ - Ontology: Structured fields for filtering/sorting (years_experience, skills[])
36
+
37
+ 2. **Legal (Contract Analysis)** - Agent-extracted
38
+ - Ontology: Queryable fields (parties, effective_date, payment_amount)
39
+
40
+ 3. **Medical Knowledge Base** - Direct-loaded
41
+ - Ontology: Disorders, symptoms, medications from curated markdown files
42
+ - Enables semantic search over psychiatric/medical domain knowledge
43
+
44
+ 4. **Documentation/Procedures** - Direct-loaded
45
+ - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
46
+ - Reference material accessible via RAG
47
+
48
+ **Design:**
49
+ - `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
50
+ - `uri` field for external source references (git://, s3://, https://)
51
+ - Structured data in `extracted_data` (arbitrary JSON)
52
+ - Embeddings generated for semantic search via `content` field
53
+ - Tenant-isolated: OntologyConfigs are tenant-scoped
54
+ """
55
+
56
+ from typing import Any, Optional
57
+ from uuid import UUID
58
+
59
+ from pydantic import ConfigDict
60
+
61
+ from ..core.core_model import CoreModel
62
+
63
+
64
class Ontology(CoreModel):
    """Domain-specific knowledge record, agent-extracted or direct-loaded.

    Only ``name`` is required. The remaining fields fall into three groups:

    Source reference:
        uri: External source reference (git://, s3://, https://), used for
            direct-loaded ontologies.

    Agent extraction (only meaningful for agent-extracted ontologies):
        file_id: Reference to the File entity the data was extracted from.
        agent_schema_id: Schema that performed the extraction.
        provider_name: LLM provider used for extraction.
        model_name: Specific model used.

    Data:
        extracted_data: Structured data, either produced by an agent or
            parsed from the source.
        confidence_score: Confidence reported by the extraction (documented
            range 0.0-1.0; the range is not validated here).
        extraction_timestamp: When extraction happened, as an ISO8601 string.
        content: Text used for generating the embedding.

    Further fields (the examples below use ``tags``) are inherited from
    ``CoreModel``; see that class for the authoritative list.

    Example Usage:
        # Agent-extracted: CV parsing
        cv_ontology = Ontology(
            name="john-doe-cv-2024",
            file_id="file-uuid-123",
            agent_schema_id="cv-parser-v1",
            provider_name="anthropic",
            model_name="claude-sonnet-4-5-20250929",
            extracted_data={
                "candidate_name": "John Doe",
                "skills": ["Python", "PostgreSQL", "Kubernetes"],
            },
            confidence_score=0.95,
            tags=["cv", "engineering"]
        )

        # Direct-loaded: knowledge base document from git
        api_docs = Ontology(
            name="rest-api-guide",
            uri="git://example-org/docs/api/rest-api-guide.md",
            content="# REST API Guide\\n\\nThis guide covers RESTful API design...",
            extracted_data={
                "type": "documentation",
                "category": "api",
                "version": "2.0",
            },
            tags=["api", "rest", "documentation"]
        )
    """

    # Human-readable label for this ontology instance (required).
    name: str
    # External source: git://, s3://, https://
    uri: Optional[str] = None

    # Agent-extraction provenance; left as None for direct-loaded ontologies.
    # file_id is the FK to the File entity.
    file_id: Optional[UUID | str] = None
    # Schema that performed the extraction.
    agent_schema_id: Optional[str] = None
    # LLM provider (anthropic, openai, etc.) and the specific model used.
    provider_name: Optional[str] = None
    model_name: Optional[str] = None

    # Structured payload and extraction metadata.
    extracted_data: Optional[dict[str, Any]] = None
    # 0.0-1.0 if provided by the agent.
    confidence_score: Optional[float] = None
    # ISO8601 timestamp.
    extraction_timestamp: Optional[str] = None

    # Semantic search support: 'content' is a default embeddable field name.
    content: Optional[str] = None

    model_config = ConfigDict(
        json_schema_extra={
            "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
            "examples": [
                # Direct-loaded example.
                {
                    "name": "panic-disorder",
                    "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
                    "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
                    "extracted_data": {
                        "type": "disorder",
                        "category": "anxiety",
                        "icd10": "F41.0",
                    },
                    "tags": ["disorder", "anxiety"],
                },
                # Agent-extracted example.
                {
                    "name": "john-doe-cv-2024",
                    "file_id": "550e8400-e29b-41d4-a716-446655440000",
                    "agent_schema_id": "cv-parser-v1",
                    "provider_name": "anthropic",
                    "model_name": "claude-sonnet-4-5-20250929",
                    "extracted_data": {
                        "candidate_name": "John Doe",
                        "skills": ["Python", "PostgreSQL"],
                    },
                    "confidence_score": 0.95,
                    "tags": ["cv", "engineering"],
                },
            ],
        }
    )
@@ -0,0 +1,131 @@
1
+ """OntologyConfig entity for user-defined ontology extraction rules.
2
+
3
+ OntologyConfig allows users to define which agent schemas should be applied to
4
+ which files during the dreaming/processing workflow. This enables domain-specific
5
+ knowledge extraction tailored to user needs.
6
+
7
+ Examples:
8
+ - "Apply cv-parser-v1 to all PDF files in /resumes/"
9
+ - "Apply contract-analyzer-v2 to files tagged with 'legal'"
10
+ - "Apply medical-records-extractor to files with mime_type application/pdf AND tags ['medical']"
11
+
12
+ Design:
13
+ - Each config is tenant-scoped for isolation
14
+ - File matching via mime_type patterns, tag filters, and URI patterns
15
+ - Multiple configs can match a single file (all will be applied)
16
+ - Priority field for execution order when multiple configs match
17
+ - Enabled/disabled toggle for temporary deactivation
18
+ """
19
+
20
+ from typing import Optional
21
+
22
+ from pydantic import ConfigDict
23
+
24
+ from ..core.core_model import CoreModel
25
+
26
+
27
class OntologyConfig(CoreModel):
    """User-defined rule stating which agent schema to run on which files.

    ``name`` and ``agent_schema_id`` are required; everything else has a
    default.

    Identification:
        name: Human-readable config name.
        agent_schema_id: Reference to the Schema entity used for extraction.
        description: Purpose and scope of this config.

    File matching (the matching itself is implemented outside this class;
    the semantics below are the documented intent):
        mime_type_pattern: Regex for file MIME types, e.g. "application/pdf".
        uri_pattern: Regex for file URIs, e.g. "s3://bucket/resumes/.*".
        tag_filter: Tags the file must carry; a file must have ALL of them.

    Execution control:
        priority: Execution order, higher runs earlier (default 100).
        enabled: Whether this config is active (default True).

    LLM overrides:
        provider_name: Optional provider override (defaults to settings).
        model_name: Optional model override (defaults to settings).

    Further fields (the examples below use ``tenant_id`` and ``tags``) are
    inherited from ``CoreModel``; see that class for the authoritative list.

    Example Usage:
        # CV extraction for recruitment
        cv_config = OntologyConfig(
            name="recruitment-cv-parser",
            agent_schema_id="cv-parser-v1",
            description="Extract candidate information from resumes",
            mime_type_pattern="application/pdf",
            uri_pattern=".*/resumes/.*",
            tag_filter=["cv", "candidate"],
            priority=100,
            enabled=True,
            tenant_id="acme-corp",
            tags=["recruitment", "hr"]
        )

        # Contract analysis with a provider override; runs before the
        # config above because its priority is higher
        contract_config = OntologyConfig(
            name="legal-contract-analyzer",
            agent_schema_id="contract-parser-v2",
            description="Extract key terms from supplier contracts",
            mime_type_pattern="application/(pdf|msword|vnd.openxmlformats.*)",
            tag_filter=["legal", "contract"],
            priority=200,
            enabled=True,
            provider_name="openai",
            model_name="gpt-4.1",
            tenant_id="acme-corp",
            tags=["legal", "procurement"]
        )
    """

    # Identification. agent_schema_id is the foreign key to the Schema entity.
    name: str
    agent_schema_id: str
    description: Optional[str] = None

    # File matching rules: regex for MIME type, regex for file URI, and the
    # tags a file must all carry.
    mime_type_pattern: Optional[str] = None
    uri_pattern: Optional[str] = None
    tag_filter: list[str] = []

    # Execution control: higher priority runs first; `enabled` lets a config
    # be switched off without deleting it.
    priority: int = 100
    enabled: bool = True

    # Optional overrides for the default provider and model.
    provider_name: Optional[str] = None
    model_name: Optional[str] = None

    model_config = ConfigDict(
        json_schema_extra={
            "description": "Configuration for automatic ontology extraction from files",
            "examples": [
                {
                    "name": "recruitment-cv-parser",
                    "agent_schema_id": "cv-parser-v1",
                    "description": "Extract candidate information from resumes",
                    "mime_type_pattern": "application/pdf",
                    "uri_pattern": ".*/resumes/.*",
                    "tag_filter": ["cv", "candidate"],
                    "priority": 100,
                    "enabled": True,
                    "tenant_id": "acme-corp",
                },
            ],
        }
    )
@@ -0,0 +1,95 @@
1
+ """
2
+ Resource - Base content unit in REM.
3
+
4
+ Resources represent documents, conversations, artifacts, and any other
5
+ content units that form the foundation of the REM memory system.
6
+
7
+ Resources are the primary input to dreaming workflows:
8
+ - First-order dreaming extracts Moments from Resources
9
+ - Second-order dreaming creates affinity edges between Resources
10
+ - Entity extraction populates related_entities field
11
+ - Graph edges stored in graph_edges (inherited from CoreModel)
12
+
13
+ Key Fields:
14
+ - name: Human-readable resource identifier (used in graph labels)
15
+ - uri: Content location or identifier
16
+ - content: Actual content text
17
+ - timestamp: Content creation/publication time
18
+ - category: Resource classification (document, conversation, artifact, etc.)
19
+ - related_entities: Extracted entities (people, projects, concepts)
20
+ """
21
+
22
+ from datetime import datetime
23
+ from typing import Optional
24
+
25
+ from pydantic import Field, model_validator
26
+
27
+ from ..core import CoreModel
28
+
29
+
30
class Resource(CoreModel):
    """
    Base content unit in REM.

    Resources are content units that feed into dreaming workflows for moment
    extraction and affinity graph construction. Tenant isolation is provided
    via CoreModel.tenant_id field.

    When ``name`` is not supplied, it is derived after validation from
    ``uri`` and ``ordinal`` (see ``generate_name_if_missing``).
    """

    name: Optional[str] = Field(
        default=None,
        description="Human-readable resource name (used as graph label). Auto-generated from uri+ordinal if not provided.",
        json_schema_extra={"entity_key": True},  # Primary business key for KV lookups
    )
    uri: Optional[str] = Field(
        default=None,
        description="Content URI or identifier (file path, URL, etc.)",
    )
    ordinal: int = Field(
        default=0,
        description="Chunk ordinal for splitting large documents (0 for single-chunk resources)",
        json_schema_extra={"composite_key": True},  # Part of composite unique constraint
    )
    content: str = Field(
        default="",
        description="Resource content text",
    )
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12 and returns
    # a naive datetime. It is kept here so the default value stays naive;
    # switching to an aware timestamp should be a deliberate, separate change.
    timestamp: datetime = Field(
        default_factory=datetime.utcnow,
        description="Resource timestamp (content creation/publication time)",
    )
    category: Optional[str] = Field(
        default=None,
        description="Resource category (document, conversation, artifact, etc.)",
    )
    related_entities: list[dict] = Field(
        default_factory=list,
        description="Extracted entities (people, projects, concepts) with metadata",
    )

    @model_validator(mode='after')
    def generate_name_if_missing(self) -> 'Resource':
        """
        Auto-generate ``name`` from uri+ordinal if not provided.

        Resolution order when ``name`` is empty or None:

        1. The last path segment of ``uri`` with its final extension removed,
           suffixed with ``-chunk-<ordinal>`` when ``ordinal`` is positive.
        2. ``resource-<first 8 characters of id>`` when ``id`` is set.
        3. The literal ``"unnamed-resource"``.

        A URI that yields no usable segment (for example one consisting only
        of slashes) falls through to steps 2 and 3 instead of producing an
        empty name.

        Returns:
            The validated instance, with ``name`` populated.
        """
        if self.name:
            return self

        base = ""
        if self.uri:
            # Last path segment; trailing slashes are ignored.
            base = self.uri.rstrip('/').split('/')[-1]

            # Remove file extension for cleaner names, but only when something
            # is left afterwards: a dot-file such as ".env" keeps its full
            # name rather than collapsing to an empty string.
            stem = base.rsplit('.', 1)[0] if '.' in base else base
            if stem:
                base = stem

        if base:
            if self.ordinal > 0:
                self.name = f"{base}-chunk-{self.ordinal}"
            else:
                self.name = base
        elif self.id:
            # Fallback: short id prefix.
            self.name = f"resource-{str(self.id)[:8]}"
        else:
            self.name = "unnamed-resource"

        return self