remdb-0.3.242-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

This version of remdb has been flagged as potentially problematic.

Files changed (235)
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/schemas/evaluators/hello-world/default.yaml
@@ -0,0 +1,77 @@
+ ---
+ type: object
+ description: |
+   Evaluate the hello-world agent responses for correctness and helpfulness.
+
+   You will receive:
+   - input: The user's question
+   - output: The agent's response (with "response" and "confidence" fields)
+   - expected: The reference/ground truth response
+
+   Scoring Rubric:
+   - Correctness (0-1): Does the response answer the question accurately?
+     1.0 = Perfect match with reference
+     0.8 = Semantically equivalent
+     0.5 = Partially correct
+     0.2 = Mostly wrong
+     0.0 = Completely wrong
+
+   - Helpfulness (0-1): Is the response useful to the user?
+     1.0 = Very helpful
+     0.7 = Somewhat helpful
+     0.5 = Neutral
+     0.2 = Not very helpful
+     0.0 = Not helpful at all
+
+   Pass threshold: Average score >= 0.7
+
+ properties:
+   correctness_score:
+     type: number
+     minimum: 0.0
+     maximum: 1.0
+     description: Score for factual correctness (0-1)
+
+   correctness_details:
+     type: string
+     description: Explanation of correctness assessment
+
+   helpfulness_score:
+     type: number
+     minimum: 0.0
+     maximum: 1.0
+     description: Score for helpfulness (0-1)
+
+   helpfulness_details:
+     type: string
+     description: Explanation of helpfulness assessment
+
+   overall_score:
+     type: number
+     minimum: 0.0
+     maximum: 1.0
+     description: Average of correctness and helpfulness scores
+
+   pass:
+     type: boolean
+     description: True if overall_score >= 0.7
+
+   explanation:
+     type: string
+     description: Overall explanation combining all assessments
+
+ required:
+   - correctness_score
+   - correctness_details
+   - helpfulness_score
+   - helpfulness_details
+   - overall_score
+   - pass
+   - explanation
+
+ json_schema_extra:
+   evaluator_type: llm-as-judge
+   provider_configs:
+     - provider_name: openai
+       model_name: gpt-4o-mini
+ labels: [test, hello-world, evaluator]
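
The hello-world schema leaves the aggregation arithmetic to the judge: overall_score is the mean of the two rubric scores, and pass applies the 0.7 threshold from the rubric. A minimal sketch of that arithmetic, using the field names from the schema above (the function name itself is hypothetical):

```python
# Minimal sketch: combine the two rubric scores into the overall_score and
# pass fields the hello-world evaluator schema requires.
def aggregate_hello_world(correctness_score: float, helpfulness_score: float) -> dict:
    overall = (correctness_score + helpfulness_score) / 2
    return {
        "overall_score": overall,
        "pass": overall >= 0.7,  # pass threshold from the rubric
    }

# Example: semantically equivalent (0.8) and very helpful (1.0)
print(aggregate_hello_world(0.8, 1.0))  # -> {'overall_score': 0.9, 'pass': True}
```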
rem/schemas/evaluators/rem/faithfulness.yaml
@@ -0,0 +1,219 @@
+ description: "You are THE JUDGE evaluating agent faithfulness to retrieved context.\n\
+   \n**Faithfulness Evaluation (inspired by RAGAS)**\n\nYour job is to evaluate whether\
+   \ the agent's response is **grounded in retrieved context**\nor if the agent is\
+   \ **hallucinating** (making up information not in context).\n\n**Key Concept: Faithfulness**\n\
+   \nFaithfulness measures: \"Does the agent's answer contain ONLY information from\
+   \ retrieved context?\"\n\nFormula: Faithful statements / Total statements in answer\n\
+   \n**The Hallucination Problem:**\n\nAgents often:\n- Make up entities not in retrieved\
+   \ context\n- Invent properties not mentioned in context\n- Extrapolate beyond what\
+   \ context actually says\n- Mix factual info from context with fabricated details\n\
+   \n**Your Task:**\n\n1. **Break down agent's answer** into individual claims/statements\n\
+   2. **For each claim**, check if it's supported by retrieved context\n3. **Mark as\
+   \ faithful or hallucinated**\n4. **Calculate faithfulness score**\n\n**Example Evaluation:**\n\
+   \nQuery: \"What is Sarah Chen's role and experience?\"\n\nRetrieved Context (from\
+   \ REM):\n- Entity: sarah-chen (person)\n- Properties:\n - role: \"Senior AI Engineer\"\
+   \n - experience_years: 5\n - technologies: [\"PostgreSQL\", \"Python\", \"LangChain\"\
+   ]\n - current_project: \"tidb-migration-spec\"\n\nAgent's Answer:\n\"Sarah Chen\
+   \ is a Senior AI Engineer with 5 years of experience. She specializes in\nPostgreSQL,\
+   \ Python, and LangChain. She is currently working on the TiDB migration\nproject\
+   \ and leads a team of 3 engineers. She has a PhD in Computer Science from MIT.\"\
+   \n\nFaithfulness Analysis:\n\nClaim 1: \"Senior AI Engineer\" ✓ FAITHFUL (in context:\
+   \ role)\nClaim 2: \"5 years of experience\" ✓ FAITHFUL (in context: experience_years)\n\
+   Claim 3: \"PostgreSQL, Python, LangChain\" ✓ FAITHFUL (in context: technologies)\n\
+   Claim 4: \"working on TiDB migration\" ✓ FAITHFUL (in context: current_project)\n\
+   Claim 5: \"leads a team of 3 engineers\" ✗ HALLUCINATION (NOT in context)\nClaim\
+   \ 6: \"PhD from MIT\" ✗ HALLUCINATION (NOT in context)\n\nFaithfulness: 4 faithful\
+   \ / 6 total = 0.67 (67%)\n\n**Faithfulness Criteria:**\n\nFor each claim in agent's\
+   \ answer:\n1. Is it explicitly stated in retrieved context?\n2. Is it a reasonable\
+   \ inference from context? (Use STRICT standard - prefer explicit)\n3. Is it common\
+   \ knowledge? (e.g., \"Python is a programming language\" - OK even if not in context)\n\
+   \n**Scoring Rules:**\n\n**Faithfulness Score (0.0-1.0):**\n- 1.0: All claims supported\
+   \ by context (100%)\n- 0.9: One minor unsupported claim (90%+)\n- 0.8: One significant\
+   \ unsupported claim (80-90%)\n- 0.7: A few unsupported claims (70-80%)\n- 0.5: Many\
+   \ unsupported claims (50-70%)\n- 0.3: Mostly unsupported (30-50%)\n- 0.0: Nearly\
+   \ all hallucinations (<30%)\n\n**Hallucination Severity (categorical):**\n- \"none\"\
+   : No hallucinations detected\n- \"minor\": Small details not in context (low impact)\n\
+   - \"moderate\": Significant details invented (medium impact)\n- \"severe\": Major\
+   \ facts fabricated (high impact)\n\n**Context Usage Quality (0.0-1.0):**\n- 1.0:\
+   \ Agent uses ALL relevant context, adds nothing unsupported\n- 0.8: Agent uses most\
+   \ context, minor additions\n- 0.6: Agent uses some context, several additions\n\
+   - 0.4: Agent uses little context, many additions\n- 0.2: Agent mostly ignoring context\n\
+   - 0.0: Agent completely ignoring context\n\n**YOUR ROLE: STRICT HALLUCINATION DETECTOR**\n\
+   \n1. **NO CELEBRATION** - Grade objectively\n2. **STRICT GRADING** - Any unsupported\
+   \ claim = hallucination\n3. **EXPLICIT SUPPORT REQUIRED** - Don't accept \"reasonable\
+   \ inferences\"\n4. **SEVERITY MATTERS** - Distinguish minor vs severe hallucinations\n\
+   \nBreak down agent's answer claim-by-claim and verify EACH against context.\n"
+ fully_qualified_name: rem.evaluators.faithfulness.REMFaithfulnessEvaluator
+ title: REMFaithfulnessEvaluator
+ type: object
+ labels:
+ - Evaluator
+ - REM
+ - Faithfulness
+ - Hallucination
+ - RAG
+ properties:
+   faithfulness_score:
+     type: number
+     description: 'Faithfulness: Supported claims / Total claims in answer.
+
+       Formula: |Claims ∩ Context| / |Claims|
+
+       '
+     minimum: 0
+     maximum: 1
+   hallucination_severity:
+     type: string
+     description: 'Severity of hallucinations detected.
+
+       '
+     enum:
+     - none
+     - minor
+     - moderate
+     - severe
+   context_usage_quality:
+     type: number
+     description: 'Score 0-1 for how well agent uses retrieved context.
+
+       Does agent use relevant info from context? Or ignore it?
+
+       '
+     minimum: 0
+     maximum: 1
+   overall_grounding_score:
+     type: number
+     description: 'Overall grounding: Average of faithfulness + context_usage_quality.
+
+       '
+     minimum: 0
+     maximum: 1
+   pass:
+     type: boolean
+     description: 'True if faithfulness_score >= 0.80 AND hallucination_severity !=
+       "severe".
+
+       '
+   claim_analysis:
+     type: array
+     description: 'Per-claim faithfulness assessment.
+
+       Break down agent''s answer into individual claims and verify each.
+
+       '
+     items:
+       type: object
+       properties:
+         claim:
+           type: string
+           description: Individual claim from agent's answer
+         faithful:
+           type: boolean
+           description: Is claim supported by retrieved context?
+         context_support:
+           type: string
+           description: 'If faithful: quote from context that supports claim.
+
+             If unfaithful: note "NOT IN CONTEXT"
+
+             '
+         severity:
+           type: string
+           enum:
+           - minor
+           - moderate
+           - severe
+           description: 'If unfaithful, how severe is this hallucination?
+
+             '
+   hallucinations_detected:
+     type: array
+     description: 'List of all hallucinated claims (not supported by context).
+
+       '
+     items:
+       type: object
+       properties:
+         claim:
+           type: string
+         severity:
+           type: string
+         impact:
+           type: string
+           description: Why this hallucination matters
+   unused_context:
+     type: array
+     description: 'Important information in retrieved context that agent DIDN''T use.
+
+       Helps identify if agent is ignoring relevant data.
+
+       '
+     items:
+       type: string
+   faithfulness_analysis:
+     type: string
+     description: "Detailed analysis of faithfulness.\nExample: \"Agent made 6 claims,\
+       \ 4 supported by context (67% faithful).\n Hallucinated team size and\
+       \ education details not in retrieved entity.\"\n"
+   context_usage_analysis:
+     type: string
+     description: "Analysis of how well agent uses context.\nExample: \"Agent used\
+       \ role, experience, and technologies from context.\n Ignored current_project\
+       \ field (relevant but unused).\"\n"
+   strengths:
+     type: array
+     description: 'What the agent did well (objective).
+
+       '
+     items:
+       type: string
+   critical_gaps:
+     type: array
+     description: 'Major issues (severe hallucinations, ignored context, etc.).
+
+       '
+     items:
+       type: string
+   improvement_suggestions:
+     type: array
+     description: 'Actionable suggestions to improve faithfulness.
+
+       Example: "Add explicit instruction: ''Only use information from retrieved entities''"
+
+       '
+     items:
+       type: string
+   confidence_in_grading:
+     type: string
+     description: 'Your confidence: "high", "medium", "low"
+
+       '
+     enum:
+     - high
+     - medium
+     - low
+   grading_notes:
+     type: string
+     description: "Internal notes about judgment calls.\nNote: Common knowledge claims\
+       \ (e.g., \"PostgreSQL is a database\") are OK\n even if not explicitly\
+       \ in context.\n"
+ required:
+ - faithfulness_score
+ - hallucination_severity
+ - context_usage_quality
+ - overall_grounding_score
+ - pass
+ - claim_analysis
+ - hallucinations_detected
+ - unused_context
+ - faithfulness_analysis
+ - context_usage_analysis
+ - strengths
+ - critical_gaps
+ - improvement_suggestions
+ - confidence_in_grading
+ - grading_notes
+ version: 1.0.0
+ json_schema_extra:
+   kind: evaluator
+   name: rem-faithfulness
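
The faithfulness schema pins down simple arithmetic around the judge's per-claim verdicts: faithfulness_score is faithful claims over total claims, overall_grounding_score averages it with context_usage_quality, and pass requires faithfulness >= 0.80 with no severe hallucinations. A sketch under those definitions (the helper name is hypothetical; the claim classification itself comes from the LLM judge, not this code):

```python
# Sketch of the faithfulness arithmetic described in the evaluator prompt.
def faithfulness_metrics(claims: list[dict], context_usage_quality: float,
                         hallucination_severity: str) -> dict:
    faithful = sum(1 for c in claims if c["faithful"])
    score = faithful / len(claims) if claims else 0.0
    return {
        "faithfulness_score": score,
        "overall_grounding_score": (score + context_usage_quality) / 2,
        "pass": score >= 0.80 and hallucination_severity != "severe",
    }

# Sarah Chen example from the prompt: 4 faithful claims out of 6 -> 0.67, fail.
claims = [{"faithful": True}] * 4 + [{"faithful": False}] * 2
print(faithfulness_metrics(claims, context_usage_quality=0.8,
                           hallucination_severity="moderate"))
```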
rem/schemas/evaluators/rem/lookup-correctness.yaml
@@ -0,0 +1,182 @@
+ description: "You are THE JUDGE evaluating a REM agent's response to a LOOKUP query.\n\
+   \n**REM LOOKUP Query Pattern:**\n\nLOOKUP queries retrieve entities by their natural\
+   \ language labels (NOT UUIDs):\n- Format: \"LOOKUP entity_type:entity_label\"\n\
+   - Examples:\n - \"LOOKUP person:sarah-chen\" → person entity with label \"sarah-chen\"\
+   \n - \"LOOKUP project:tidb-migration-spec\" → project entity with label \"tidb-migration-spec\"\
+   \n - \"LOOKUP technology:postgresql\" → technology entity with label \"postgresql\"\
+   \n\n**Expected Behavior:**\n\n1. **O(1) Performance Contract**: LOOKUP must be fast\
+   \ (hash/index lookup)\n2. **Complete Entity Data**: Return full entity with properties,\
+   \ graph_edges, metadata\n3. **Natural Language Labels**: Use human-readable labels\
+   \ (not UUIDs)\n4. **Type Validation**: Verify entity_type matches actual entity\n\
+   \n**Common Errors to Catch:**\n\n1. **Hallucinations**:\n - Made-up properties\
+   \ not in reference\n - Invented graph_edges to non-existent entities\n - Fake\
+   \ metadata fields\n\n2. **Incomplete Data**:\n - Missing properties from reference\n\
+   \ - Missing graph_edges (relationships)\n - Missing metadata\n\n3. **Wrong Data**:\n\
+   \ - Properties with incorrect values\n - Graph edges with wrong weights or destinations\n\
+   \ - Metadata with incorrect types\n\n**YOUR ROLE: STRICT AND CRITICAL JUDGE**\n\
+   \n1. **NO CELEBRATION** - Grade objectively, no praise\n2. **STRICT GRADING** -\
+   \ Missing data = points deducted\n3. **CATCH HALLUCINATIONS** - Made-up data = FAIL\n\
+   4. **VERIFY COMPLETENESS** - Compare carefully to reference\n5. **CHECK TYPES**\
+   \ - Ensure entity_type matches\n\n**Scoring Rubric:**\n\n**Correctness (0.0-1.0):**\n\
+   - 1.0: All properties, edges, metadata match reference exactly\n- 0.8: Minor differences\
+   \ (e.g., property value formatting)\n- 0.6: Missing 1-2 properties or edges\n- 0.4:\
+   \ Several missing or incorrect fields\n- 0.2: Major data errors\n- 0.0: Wrong entity\
+   \ returned or hallucinated data\n\n**Completeness (0.0-1.0):**\n- 1.0: All fields\
+   \ from reference present\n- 0.8: Missing 1 optional field\n- 0.6: Missing 2-3 fields\n\
+   - 0.4: Missing several fields\n- 0.2: Major gaps in data\n- 0.0: Nearly empty response\n\
+   \n**Performance Contract (0.0-1.0):**\n- 1.0: Response indicates O(1) lookup (fast,\
+   \ indexed)\n- 0.5: Response suggests iteration or search (slow)\n- 0.0: Response\
+   \ clearly violates O(1) contract\n\n**Overall Score:** Average of 3 dimensions\n\
+   **Pass Threshold:** >= 0.75 (strict - agents must be accurate)\n\nCompare agent\
+   \ output to reference carefully. Identify ALL gaps and errors.\n"
+ fully_qualified_name: rem.evaluators.lookup_correctness.REMLookupCorrectnessEvaluator
+ title: REMLookupCorrectnessEvaluator
+ type: object
+ labels:
+ - Evaluator
+ - REM
+ - LOOKUP
+ - Correctness
+ properties:
+   correctness_score:
+     type: number
+     description: 'Score 0-1 for accuracy of returned entity data.
+
+       Compare properties, graph_edges, metadata to reference.
+
+       Deduct for ANY hallucinations (instant 0.0).
+
+       '
+     minimum: 0
+     maximum: 1
+   completeness_score:
+     type: number
+     description: 'Score 0-1 for completeness of returned entity data.
+
+       Are all properties from reference present?
+
+       Are all graph_edges included?
+
+       Is metadata complete?
+
+       '
+     minimum: 0
+     maximum: 1
+   performance_contract_score:
+     type: number
+     description: 'Score 0-1 for adherence to O(1) performance contract.
+
+       LOOKUP must use index/hash lookup, not iteration.
+
+       '
+     minimum: 0
+     maximum: 1
+   overall_score:
+     type: number
+     description: 'Average of correctness + completeness + performance_contract (sum/3).
+
+       '
+     minimum: 0
+     maximum: 1
+   pass:
+     type: boolean
+     description: 'True if overall_score >= 0.75 AND correctness_score >= 0.6
+
+       AND no hallucinations detected.
+
+       '
+   correctness_details:
+     type: string
+     description: 'Specific accuracy issues found.
+
+       Example: "graph_edges weight for ''tidb-migration'' is 0.9 (expected 1.0)"
+
+       '
+   completeness_details:
+     type: string
+     description: 'Specific missing fields or data.
+
+       Example: "Missing graph_edge to ''postgresql'' technology, missing ''role''
+       property"
+
+       '
+   performance_details:
+     type: string
+     description: 'Assessment of performance contract adherence.
+
+       Example: "Response indicates direct lookup (good)" or "Response suggests iteration
+       (bad)"
+
+       '
+   hallucinations_detected:
+     type: array
+     description: 'List of any made-up data not in reference.
+
+       Example: "Property ''team'' not in reference", "Edge to ''fake-project'' not
+       in reference"
+
+       '
+     items:
+       type: string
+   missing_fields:
+     type: array
+     description: 'List of fields in reference but missing from agent output.
+
+       '
+     items:
+       type: string
+   strengths:
+     type: array
+     description: 'What the agent did well (objective, not celebratory).
+
+       '
+     items:
+       type: string
+   critical_gaps:
+     type: array
+     description: 'Major issues that must be fixed (blockers).
+
+       '
+     items:
+       type: string
+   improvement_suggestions:
+     type: array
+     description: 'Actionable suggestions to improve accuracy and completeness.
+
+       '
+     items:
+       type: string
+   confidence_in_grading:
+     type: string
+     description: 'Your confidence in this grade: "high", "medium", "low"
+
+       '
+     enum:
+     - high
+     - medium
+     - low
+   grading_notes:
+     type: string
+     description: 'Internal notes about edge cases or judgment calls.
+
+       '
+ required:
+ - correctness_score
+ - completeness_score
+ - performance_contract_score
+ - overall_score
+ - pass
+ - correctness_details
+ - completeness_details
+ - performance_details
+ - hallucinations_detected
+ - missing_fields
+ - strengths
+ - critical_gaps
+ - improvement_suggestions
+ - confidence_in_grading
+ - grading_notes
+ version: 1.0.0
+ json_schema_extra:
+   kind: evaluator
+   name: rem-lookup-correctness
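
The lookup-correctness schema also spells out its verdict logic: overall_score averages the three dimensions, and pass requires overall >= 0.75, correctness >= 0.6, and an empty hallucination list. A sketch of that logic with field names from the schema (the judge supplies the three scores; the function name is hypothetical):

```python
# Sketch of the pass logic the lookup-correctness schema describes.
def lookup_verdict(correctness: float, completeness: float,
                   performance_contract: float, hallucinations: list[str]) -> dict:
    overall = (correctness + completeness + performance_contract) / 3
    return {
        "overall_score": overall,
        "pass": overall >= 0.75 and correctness >= 0.6 and not hallucinations,
    }

# Entity correct and complete, but the response implies a scan instead of an
# indexed lookup: (1.0 + 1.0 + 0.5) / 3 ≈ 0.83, so it still passes.
print(lookup_verdict(1.0, 1.0, 0.5, hallucinations=[]))
```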
rem/schemas/evaluators/rem/retrieval-precision.yaml
@@ -0,0 +1,199 @@
+ description: "You are THE JUDGE evaluating REM retrieval quality using precision metrics.\n\
+   \n**Context Precision Evaluation (inspired by RAGAS)**\n\nYour job is to evaluate\
+   \ whether REM query execution (LOOKUP, SEARCH, TRAVERSE) retrieves\nrelevant entities\
+   \ and ranks them appropriately.\n\n**Key Concept: Precision@K**\n\nPrecision measures:\
+   \ \"Of the K entities retrieved, how many are actually relevant?\"\n\nFormula: Relevant\
+   \ entities / Total retrieved entities\n\n**Ranking Quality Matters:**\n\nRetrieval\
+   \ systems should rank MORE relevant entities HIGHER in results.\nAn irrelevant entity\
+   \ at position #1 is worse than an irrelevant entity at position #10.\n\n**Your Task:**\n\
+   \n1. **Examine each retrieved entity** (in order)\n2. **Judge relevance** to the\
+   \ user's query\n3. **Calculate precision scores** at each position\n4. **Compute\
+   \ overall precision@K**\n\n**Example Evaluation:**\n\nQuery: \"SEARCH person AI\
+   \ engineer with database experience\"\n\nRetrieved Entities (in order):\n1. sarah-chen\
+   \ (person) - \"AI engineer with 5 years PostgreSQL experience\"\n → RELEVANT (AI\
+   \ engineer + database) ✓\n2. john-doe (person) - \"Frontend developer, React specialist\"\
+   \n → NOT RELEVANT (no AI or database) ✗\n3. alice-wang (person) - \"Database administrator\
+   \ with ML background\"\n → RELEVANT (database + ML) ✓\n4. bob-smith (person) -\
+   \ \"Backend engineer, Java/Spring\"\n → NOT RELEVANT (no AI or database) ✗\n5.\
+   \ eve-jones (person) - \"Data scientist with PostgreSQL expertise\"\n → RELEVANT\
+   \ (data science + database) ✓\n\nPrecision Calculation:\n- Position 1: 1 relevant\
+   \ of 1 = 1.00 (100%)\n- Position 2: 1 relevant of 2 = 0.50 (50%)\n- Position 3:\
+   \ 2 relevant of 3 = 0.67 (67%)\n- Position 4: 2 relevant of 4 = 0.50 (50%)\n- Position\
+   \ 5: 3 relevant of 5 = 0.60 (60%)\n\nOverall Precision@5: Average = (1.00 + 0.50\
+   \ + 0.67 + 0.50 + 0.60) / 5 = 0.65\n\n**Weighted Precision (penalizes early irrelevant\
+   \ results):**\n(1.00×1 + 0.50×0 + 0.67×1 + 0.50×0 + 0.60×1) / 3 relevant items =\
+   \ 0.76\n\n**Relevance Criteria:**\n\nFor each retrieved entity, ask:\n1. Does entity\
+   \ type match query intent? (person, project, technology, etc.)\n2. Do entity properties\
+   \ match query terms? (skills, technologies, roles)\n3. Is entity semantically related\
+   \ to query? (not just keyword match)\n\n**Scoring Rules:**\n\n**Overall Precision\
+   \ (0.0-1.0):**\n- 1.0: All retrieved entities highly relevant\n- 0.8: Most entities\
+   \ relevant (1-2 borderline)\n- 0.6: About half relevant\n- 0.4: Mostly irrelevant\n\
+   - 0.2: Nearly all irrelevant\n- 0.0: No relevant entities\n\n**Ranking Quality (0.0-1.0):**\n\
+   - 1.0: Most relevant entities ranked first\n- 0.8: Good ranking (relevant items\
+   \ mostly at top)\n- 0.6: Mediocre ranking (some relevant items buried)\n- 0.4: Poor\
+   \ ranking (relevant items scattered)\n- 0.2: Very poor ranking (relevant items at\
+   \ bottom)\n- 0.0: Inverse ranking (irrelevant at top)\n\n**Expected Output Quality\
+   \ (0.0-1.0):**\n- Compare to expected entities from golden set\n- 1.0: All expected\
+   \ entities present in top results\n- 0.8: Most expected entities present\n- 0.6:\
+   \ Some expected entities missing\n- 0.4: Many expected entities missing\n- 0.2:\
+   \ Most expected entities missing\n- 0.0: No expected entities found\n\n**YOUR ROLE:\
+   \ STRICT AND OBJECTIVE**\n\n1. **NO CELEBRATION** - Grade objectively\n2. **STRICT\
+   \ GRADING** - Irrelevant entities = lower scores\n3. **RANKING MATTERS** - Penalize\
+   \ irrelevant results at top positions\n4. **VERIFY COMPLETENESS** - Are expected\
+   \ entities from golden set present?\n\nCompare retrieved entities to query intent\
+   \ and expected entities carefully.\n"
+ fully_qualified_name: rem.evaluators.retrieval_precision.REMRetrievalPrecisionEvaluator
+ title: REMRetrievalPrecisionEvaluator
+ type: object
+ labels:
+ - Evaluator
+ - REM
+ - Retrieval
+ - Precision
+ - RAG
+ properties:
+   overall_precision:
+     type: number
+     description: 'Overall precision: Relevant entities / Total retrieved entities
+
+       Calculated as average precision across all positions.
+
+       '
+     minimum: 0
+     maximum: 1
+   weighted_precision:
+     type: number
+     description: 'Weighted precision that penalizes early irrelevant results.
+
+       Formula: Σ(Precision@k × relevance_k) / Total relevant items
+
+       '
+     minimum: 0
+     maximum: 1
+   ranking_quality_score:
+     type: number
+     description: 'Score 0-1 for ranking quality.
+
+       Are most relevant entities ranked higher than irrelevant ones?
+
+       '
+     minimum: 0
+     maximum: 1
+   expected_coverage_score:
+     type: number
+     description: 'Score 0-1 for coverage of expected entities from golden set.
+
+       What fraction of expected entities were retrieved?
+
+       '
+     minimum: 0
+     maximum: 1
+   retrieval_quality_score:
+     type: number
+     description: 'Overall retrieval quality: Average of precision + ranking + coverage.
+
+       '
+     minimum: 0
+     maximum: 1
+   pass:
+     type: boolean
+     description: 'True if retrieval_quality_score >= 0.70 AND overall_precision >=
+       0.5.
+
+       '
+   entity_relevance_analysis:
+     type: array
+     description: 'Per-entity relevance assessment (in retrieval order).
+
+       Example: "Position 1: sarah-chen - RELEVANT (AI + database)"
+
+       '
+     items:
+       type: object
+       properties:
+         position:
+           type: integer
+         entity_label:
+           type: string
+         relevant:
+           type: boolean
+         reason:
+           type: string
+   precision_at_k:
+     type: array
+     description: 'Precision score at each position K.
+
+       Example: [1.0, 0.5, 0.67, 0.5, 0.6]
+
+       '
+     items:
+       type: number
+   irrelevant_entities:
+     type: array
+     description: 'List of retrieved entities judged NOT relevant to query.
+
+       '
+     items:
+       type: string
+   missing_expected_entities:
+     type: array
+     description: 'List of expected entities (from golden set) NOT retrieved.
+
+       '
+     items:
+       type: string
+   strengths:
+     type: array
+     description: 'What the retrieval did well (objective).
+
+       '
+     items:
+       type: string
+   critical_gaps:
+     type: array
+     description: 'Major issues (missing key entities, poor ranking, etc.).
+
+       '
+     items:
+       type: string
+   improvement_suggestions:
+     type: array
+     description: 'Actionable suggestions to improve retrieval quality.
+
+       '
+     items:
+       type: string
+   confidence_in_grading:
+     type: string
+     description: 'Your confidence: "high", "medium", "low"
+
+       '
+     enum:
+     - high
+     - medium
+     - low
+   grading_notes:
+     type: string
+     description: 'Internal notes about judgment calls or edge cases.
+
+       '
+ required:
+ - overall_precision
+ - weighted_precision
+ - ranking_quality_score
+ - expected_coverage_score
+ - retrieval_quality_score
+ - pass
+ - entity_relevance_analysis
+ - precision_at_k
+ - irrelevant_entities
+ - missing_expected_entities
+ - strengths
+ - critical_gaps
+ - improvement_suggestions
+ - confidence_in_grading
+ - grading_notes
+ version: 1.0.0
+ json_schema_extra:
+   kind: evaluator
+   name: rem-retrieval-precision
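
The precision@K walkthrough in this evaluator's prompt is mechanical once the judge has labeled each ranked result relevant or not. A sketch, assuming one boolean relevance judgment per position (the function name is hypothetical), that reproduces the 0.65 overall and 0.76 weighted figures from the example:

```python
# Precision@k over a ranked relevance list, plus the weighted variant
# Σ(precision@k × relevance_k) / (number of relevant items).
def precision_metrics(relevant: list[bool]) -> tuple[list[float], float, float]:
    p_at_k, hits = [], 0
    for k, rel in enumerate(relevant, start=1):
        hits += rel
        p_at_k.append(hits / k)                     # precision at position k
    overall = sum(p_at_k) / len(p_at_k)             # average over all positions
    weighted = sum(p * r for p, r in zip(p_at_k, relevant)) / sum(relevant)
    return p_at_k, overall, weighted

# sarah-chen, john-doe, alice-wang, bob-smith, eve-jones
p_at_k, overall, weighted = precision_metrics([True, False, True, False, True])
print(p_at_k)    # [1.0, 0.5, 0.667, 0.5, 0.6] (rounded)
print(overall)   # ≈ 0.65
print(weighted)  # ≈ 0.76
```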