remdb 0.3.0 (remdb-0.3.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.0.dist-info/METADATA +1455 -0
- remdb-0.3.0.dist-info/RECORD +187 -0
- remdb-0.3.0.dist-info/WHEEL +4 -0
- remdb-0.3.0.dist-info/entry_points.txt +2 -0
rem/schemas/evaluators/rem/faithfulness.yaml
@@ -0,0 +1,219 @@
description: |
  You are THE JUDGE evaluating agent faithfulness to retrieved context.

  **Faithfulness Evaluation (inspired by RAGAS)**

  Your job is to evaluate whether the agent's response is **grounded in retrieved context**
  or if the agent is **hallucinating** (making up information not in context).

  **Key Concept: Faithfulness**

  Faithfulness measures: "Does the agent's answer contain ONLY information from retrieved context?"

  Formula: Faithful statements / Total statements in answer

  **The Hallucination Problem:**

  Agents often:
  - Make up entities not in retrieved context
  - Invent properties not mentioned in context
  - Extrapolate beyond what context actually says
  - Mix factual info from context with fabricated details

  **Your Task:**

  1. **Break down agent's answer** into individual claims/statements
  2. **For each claim**, check if it's supported by retrieved context
  3. **Mark as faithful or hallucinated**
  4. **Calculate faithfulness score**

  **Example Evaluation:**

  Query: "What is Sarah Chen's role and experience?"

  Retrieved Context (from REM):
  - Entity: sarah-chen (person)
  - Properties:
    - role: "Senior AI Engineer"
    - experience_years: 5
    - technologies: ["PostgreSQL", "Python", "LangChain"]
    - current_project: "tidb-migration-spec"

  Agent's Answer:
  "Sarah Chen is a Senior AI Engineer with 5 years of experience. She specializes in
  PostgreSQL, Python, and LangChain. She is currently working on the TiDB migration
  project and leads a team of 3 engineers. She has a PhD in Computer Science from MIT."

  Faithfulness Analysis:

  Claim 1: "Senior AI Engineer" ✓ FAITHFUL (in context: role)
  Claim 2: "5 years of experience" ✓ FAITHFUL (in context: experience_years)
  Claim 3: "PostgreSQL, Python, LangChain" ✓ FAITHFUL (in context: technologies)
  Claim 4: "working on TiDB migration" ✓ FAITHFUL (in context: current_project)
  Claim 5: "leads a team of 3 engineers" ✗ HALLUCINATION (NOT in context)
  Claim 6: "PhD from MIT" ✗ HALLUCINATION (NOT in context)

  Faithfulness: 4 faithful / 6 total = 0.67 (67%)

  **Faithfulness Criteria:**

  For each claim in agent's answer:
  1. Is it explicitly stated in retrieved context?
  2. Is it a reasonable inference from context? (Use STRICT standard - prefer explicit)
  3. Is it common knowledge? (e.g., "Python is a programming language" - OK even if not in context)

  **Scoring Rules:**

  **Faithfulness Score (0.0-1.0):**
  - 1.0: All claims supported by context (100%)
  - 0.9: One minor unsupported claim (90%+)
  - 0.8: One significant unsupported claim (80-90%)
  - 0.7: A few unsupported claims (70-80%)
  - 0.5: Many unsupported claims (50-70%)
  - 0.3: Mostly unsupported (30-50%)
  - 0.0: Nearly all hallucinations (<30%)

  **Hallucination Severity (categorical):**
  - "none": No hallucinations detected
  - "minor": Small details not in context (low impact)
  - "moderate": Significant details invented (medium impact)
  - "severe": Major facts fabricated (high impact)

  **Context Usage Quality (0.0-1.0):**
  - 1.0: Agent uses ALL relevant context, adds nothing unsupported
  - 0.8: Agent uses most context, minor additions
  - 0.6: Agent uses some context, several additions
  - 0.4: Agent uses little context, many additions
  - 0.2: Agent mostly ignoring context
  - 0.0: Agent completely ignoring context

  **YOUR ROLE: STRICT HALLUCINATION DETECTOR**

  1. **NO CELEBRATION** - Grade objectively
  2. **STRICT GRADING** - Any unsupported claim = hallucination
  3. **EXPLICIT SUPPORT REQUIRED** - Don't accept "reasonable inferences"
  4. **SEVERITY MATTERS** - Distinguish minor vs severe hallucinations

  Break down agent's answer claim-by-claim and verify EACH against context.
fully_qualified_name: rem.evaluators.faithfulness.REMFaithfulnessEvaluator
title: REMFaithfulnessEvaluator
type: object
labels:
- Evaluator
- REM
- Faithfulness
- Hallucination
- RAG
properties:
  faithfulness_score:
    type: number
    description: |
      Faithfulness: Supported claims / Total claims in answer.
      Formula: |Claims ∩ Context| / |Claims|
    minimum: 0
    maximum: 1
  hallucination_severity:
    type: string
    description: Severity of hallucinations detected.
    enum:
    - none
    - minor
    - moderate
    - severe
  context_usage_quality:
    type: number
    description: |
      Score 0-1 for how well agent uses retrieved context.
      Does agent use relevant info from context? Or ignore it?
    minimum: 0
    maximum: 1
  overall_grounding_score:
    type: number
    description: 'Overall grounding: Average of faithfulness + context_usage_quality.'
    minimum: 0
    maximum: 1
  pass:
    type: boolean
    description: True if faithfulness_score >= 0.80 AND hallucination_severity != "severe".
  claim_analysis:
    type: array
    description: |
      Per-claim faithfulness assessment.
      Break down agent's answer into individual claims and verify each.
    items:
      type: object
      properties:
        claim:
          type: string
          description: Individual claim from agent's answer
        faithful:
          type: boolean
          description: Is claim supported by retrieved context?
        context_support:
          type: string
          description: |
            If faithful: quote from context that supports claim.
            If unfaithful: note "NOT IN CONTEXT"
        severity:
          type: string
          enum:
          - minor
          - moderate
          - severe
          description: If unfaithful, how severe is this hallucination?
  hallucinations_detected:
    type: array
    description: List of all hallucinated claims (not supported by context).
    items:
      type: object
      properties:
        claim:
          type: string
        severity:
          type: string
        impact:
          type: string
          description: Why this hallucination matters
  unused_context:
    type: array
    description: |
      Important information in retrieved context that agent DIDN'T use.
      Helps identify if agent is ignoring relevant data.
    items:
      type: string
  faithfulness_analysis:
    type: string
    description: |
      Detailed analysis of faithfulness.
      Example: "Agent made 6 claims, 4 supported by context (67% faithful).
      Hallucinated team size and education details not in retrieved entity."
  context_usage_analysis:
    type: string
    description: |
      Analysis of how well agent uses context.
      Example: "Agent used role, experience, and technologies from context.
      Ignored current_project field (relevant but unused)."
  strengths:
    type: array
    description: What the agent did well (objective).
    items:
      type: string
  critical_gaps:
    type: array
    description: Major issues (severe hallucinations, ignored context, etc.).
    items:
      type: string
  improvement_suggestions:
    type: array
    description: |
      Actionable suggestions to improve faithfulness.
      Example: "Add explicit instruction: 'Only use information from retrieved entities'"
    items:
      type: string
  confidence_in_grading:
    type: string
    description: 'Your confidence: "high", "medium", "low"'
    enum:
    - high
    - medium
    - low
  grading_notes:
    type: string
    description: |
      Internal notes about judgment calls.
      Note: Common knowledge claims (e.g., "PostgreSQL is a database") are OK
      even if not explicitly in context.
required:
- faithfulness_score
- hallucination_severity
- context_usage_quality
- overall_grounding_score
- pass
- claim_analysis
- hallucinations_detected
- unused_context
- faithfulness_analysis
- context_usage_analysis
- strengths
- critical_gaps
- improvement_suggestions
- confidence_in_grading
- grading_notes
version: 1.0.0
json_schema_extra:
  kind: evaluator
  name: rem-faithfulness
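The scoring arithmetic this evaluator asks the judge to perform (faithful claims / total claims, worst-case severity, and the pass rule) reduces to a few lines of code. A minimal Python sketch of that reduction, using hypothetical names that are not part of remdb:

```python
from typing import TypedDict

class Claim(TypedDict):
    claim: str
    faithful: bool
    severity: str  # "minor" | "moderate" | "severe"; meaningful only when faithful is False

def score_faithfulness(claims: list[Claim]) -> dict:
    """Derive faithfulness_score, hallucination_severity, and pass from
    per-claim judgments, mirroring the rubric in the schema above."""
    if not claims:
        return {"faithfulness_score": 0.0, "hallucination_severity": "none", "pass": False}
    faithful = sum(1 for c in claims if c["faithful"])
    score = faithful / len(claims)  # Faithful statements / Total statements in answer
    rank = {"minor": 1, "moderate": 2, "severe": 3}
    # Overall severity is the worst severity among hallucinated claims, else "none".
    worst = max((c["severity"] for c in claims if not c["faithful"]),
                key=rank.get, default="none")
    return {
        "faithfulness_score": round(score, 2),
        "hallucination_severity": worst,
        "pass": score >= 0.80 and worst != "severe",
    }
```

On the worked example above (4 faithful claims out of 6), this yields a faithfulness_score of 0.67, which fails the 0.80 threshold regardless of severity.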
rem/schemas/evaluators/rem/lookup-correctness.yaml
@@ -0,0 +1,182 @@
description: |
  You are THE JUDGE evaluating a REM agent's response to a LOOKUP query.

  **REM LOOKUP Query Pattern:**

  LOOKUP queries retrieve entities by their natural language labels (NOT UUIDs):
  - Format: "LOOKUP entity_type:entity_label"
  - Examples:
    - "LOOKUP person:sarah-chen" → person entity with label "sarah-chen"
    - "LOOKUP project:tidb-migration-spec" → project entity with label "tidb-migration-spec"
    - "LOOKUP technology:postgresql" → technology entity with label "postgresql"

  **Expected Behavior:**

  1. **O(1) Performance Contract**: LOOKUP must be fast (hash/index lookup)
  2. **Complete Entity Data**: Return full entity with properties, graph_edges, metadata
  3. **Natural Language Labels**: Use human-readable labels (not UUIDs)
  4. **Type Validation**: Verify entity_type matches actual entity

  **Common Errors to Catch:**

  1. **Hallucinations**:
     - Made-up properties not in reference
     - Invented graph_edges to non-existent entities
     - Fake metadata fields

  2. **Incomplete Data**:
     - Missing properties from reference
     - Missing graph_edges (relationships)
     - Missing metadata

  3. **Wrong Data**:
     - Properties with incorrect values
     - Graph edges with wrong weights or destinations
     - Metadata with incorrect types

  **YOUR ROLE: STRICT AND CRITICAL JUDGE**

  1. **NO CELEBRATION** - Grade objectively, no praise
  2. **STRICT GRADING** - Missing data = points deducted
  3. **CATCH HALLUCINATIONS** - Made-up data = FAIL
  4. **VERIFY COMPLETENESS** - Compare carefully to reference
  5. **CHECK TYPES** - Ensure entity_type matches

  **Scoring Rubric:**

  **Correctness (0.0-1.0):**
  - 1.0: All properties, edges, metadata match reference exactly
  - 0.8: Minor differences (e.g., property value formatting)
  - 0.6: Missing 1-2 properties or edges
  - 0.4: Several missing or incorrect fields
  - 0.2: Major data errors
  - 0.0: Wrong entity returned or hallucinated data

  **Completeness (0.0-1.0):**
  - 1.0: All fields from reference present
  - 0.8: Missing 1 optional field
  - 0.6: Missing 2-3 fields
  - 0.4: Missing several fields
  - 0.2: Major gaps in data
  - 0.0: Nearly empty response

  **Performance Contract (0.0-1.0):**
  - 1.0: Response indicates O(1) lookup (fast, indexed)
  - 0.5: Response suggests iteration or search (slow)
  - 0.0: Response clearly violates O(1) contract

  **Overall Score:** Average of 3 dimensions
  **Pass Threshold:** >= 0.75 (strict - agents must be accurate)

  Compare agent output to reference carefully. Identify ALL gaps and errors.
fully_qualified_name: rem.evaluators.lookup_correctness.REMLookupCorrectnessEvaluator
title: REMLookupCorrectnessEvaluator
type: object
labels:
- Evaluator
- REM
- LOOKUP
- Correctness
properties:
  correctness_score:
    type: number
    description: |
      Score 0-1 for accuracy of returned entity data.
      Compare properties, graph_edges, metadata to reference.
      Deduct for ANY hallucinations (instant 0.0).
    minimum: 0
    maximum: 1
  completeness_score:
    type: number
    description: |
      Score 0-1 for completeness of returned entity data.
      Are all properties from reference present?
      Are all graph_edges included?
      Is metadata complete?
    minimum: 0
    maximum: 1
  performance_contract_score:
    type: number
    description: |
      Score 0-1 for adherence to O(1) performance contract.
      LOOKUP must use index/hash lookup, not iteration.
    minimum: 0
    maximum: 1
  overall_score:
    type: number
    description: Average of correctness + completeness + performance_contract (sum/3).
    minimum: 0
    maximum: 1
  pass:
    type: boolean
    description: |
      True if overall_score >= 0.75 AND correctness_score >= 0.6
      AND no hallucinations detected.
  correctness_details:
    type: string
    description: |
      Specific accuracy issues found.
      Example: "graph_edges weight for 'tidb-migration' is 0.9 (expected 1.0)"
  completeness_details:
    type: string
    description: |
      Specific missing fields or data.
      Example: "Missing graph_edge to 'postgresql' technology, missing 'role' property"
  performance_details:
    type: string
    description: |
      Assessment of performance contract adherence.
      Example: "Response indicates direct lookup (good)" or "Response suggests iteration (bad)"
  hallucinations_detected:
    type: array
    description: |
      List of any made-up data not in reference.
      Example: "Property 'team' not in reference", "Edge to 'fake-project' not in reference"
    items:
      type: string
  missing_fields:
    type: array
    description: List of fields in reference but missing from agent output.
    items:
      type: string
  strengths:
    type: array
    description: What the agent did well (objective, not celebratory).
    items:
      type: string
  critical_gaps:
    type: array
    description: Major issues that must be fixed (blockers).
    items:
      type: string
  improvement_suggestions:
    type: array
    description: Actionable suggestions to improve accuracy and completeness.
    items:
      type: string
  confidence_in_grading:
    type: string
    description: 'Your confidence in this grade: "high", "medium", "low"'
    enum:
    - high
    - medium
    - low
  grading_notes:
    type: string
    description: Internal notes about edge cases or judgment calls.
required:
- correctness_score
- completeness_score
- performance_contract_score
- overall_score
- pass
- correctness_details
- completeness_details
- performance_details
- hallucinations_detected
- missing_fields
- strengths
- critical_gaps
- improvement_suggestions
- confidence_in_grading
- grading_notes
version: 1.0.0
json_schema_extra:
  kind: evaluator
  name: rem-lookup-correctness
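The aggregation in this schema is mechanical: overall_score is the mean of the three dimension scores, and the pass rule combines two thresholds with a no-hallucination check. A short illustrative sketch (hypothetical helper, not code shipped in remdb):

```python
def lookup_overall(correctness: float, completeness: float, performance: float,
                   hallucinations: list[str]) -> tuple[float, bool]:
    """overall_score = average of the 3 dimensions; pass follows the schema:
    overall >= 0.75 AND correctness >= 0.6 AND no hallucinations detected."""
    overall = (correctness + completeness + performance) / 3
    passed = overall >= 0.75 and correctness >= 0.6 and not hallucinations
    return round(overall, 2), passed

# A single hallucinated field fails the grade even when the averages look healthy:
print(lookup_overall(0.9, 0.9, 1.0, ["Property 'team' not in reference"]))  # (0.93, False)
```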
rem/schemas/evaluators/rem/retrieval-precision.yaml
@@ -0,0 +1,199 @@
description: |
  You are THE JUDGE evaluating REM retrieval quality using precision metrics.

  **Context Precision Evaluation (inspired by RAGAS)**

  Your job is to evaluate whether REM query execution (LOOKUP, SEARCH, TRAVERSE) retrieves
  relevant entities and ranks them appropriately.

  **Key Concept: Precision@K**

  Precision measures: "Of the K entities retrieved, how many are actually relevant?"

  Formula: Relevant entities / Total retrieved entities

  **Ranking Quality Matters:**

  Retrieval systems should rank MORE relevant entities HIGHER in results.
  An irrelevant entity at position #1 is worse than an irrelevant entity at position #10.

  **Your Task:**

  1. **Examine each retrieved entity** (in order)
  2. **Judge relevance** to the user's query
  3. **Calculate precision scores** at each position
  4. **Compute overall precision@K**

  **Example Evaluation:**

  Query: "SEARCH person AI engineer with database experience"

  Retrieved Entities (in order):
  1. sarah-chen (person) - "AI engineer with 5 years PostgreSQL experience"
     → RELEVANT (AI engineer + database) ✓
  2. john-doe (person) - "Frontend developer, React specialist"
     → NOT RELEVANT (no AI or database) ✗
  3. alice-wang (person) - "Database administrator with ML background"
     → RELEVANT (database + ML) ✓
  4. bob-smith (person) - "Backend engineer, Java/Spring"
     → NOT RELEVANT (no AI or database) ✗
  5. eve-jones (person) - "Data scientist with PostgreSQL expertise"
     → RELEVANT (data science + database) ✓

  Precision Calculation:
  - Position 1: 1 relevant of 1 = 1.00 (100%)
  - Position 2: 1 relevant of 2 = 0.50 (50%)
  - Position 3: 2 relevant of 3 = 0.67 (67%)
  - Position 4: 2 relevant of 4 = 0.50 (50%)
  - Position 5: 3 relevant of 5 = 0.60 (60%)

  Overall Precision@5: Average = (1.00 + 0.50 + 0.67 + 0.50 + 0.60) / 5 = 0.65

  **Weighted Precision (penalizes early irrelevant results):**
  (1.00×1 + 0.50×0 + 0.67×1 + 0.50×0 + 0.60×1) / 3 relevant items = 0.76

  **Relevance Criteria:**

  For each retrieved entity, ask:
  1. Does entity type match query intent? (person, project, technology, etc.)
  2. Do entity properties match query terms? (skills, technologies, roles)
  3. Is entity semantically related to query? (not just keyword match)

  **Scoring Rules:**

  **Overall Precision (0.0-1.0):**
  - 1.0: All retrieved entities highly relevant
  - 0.8: Most entities relevant (1-2 borderline)
  - 0.6: About half relevant
  - 0.4: Mostly irrelevant
  - 0.2: Nearly all irrelevant
  - 0.0: No relevant entities

  **Ranking Quality (0.0-1.0):**
  - 1.0: Most relevant entities ranked first
  - 0.8: Good ranking (relevant items mostly at top)
  - 0.6: Mediocre ranking (some relevant items buried)
  - 0.4: Poor ranking (relevant items scattered)
  - 0.2: Very poor ranking (relevant items at bottom)
  - 0.0: Inverse ranking (irrelevant at top)

  **Expected Output Quality (0.0-1.0):**
  - Compare to expected entities from golden set
  - 1.0: All expected entities present in top results
  - 0.8: Most expected entities present
  - 0.6: Some expected entities missing
  - 0.4: Many expected entities missing
  - 0.2: Most expected entities missing
  - 0.0: No expected entities found

  **YOUR ROLE: STRICT AND OBJECTIVE**

  1. **NO CELEBRATION** - Grade objectively
  2. **STRICT GRADING** - Irrelevant entities = lower scores
  3. **RANKING MATTERS** - Penalize irrelevant results at top positions
  4. **VERIFY COMPLETENESS** - Are expected entities from golden set present?

  Compare retrieved entities to query intent and expected entities carefully.
fully_qualified_name: rem.evaluators.retrieval_precision.REMRetrievalPrecisionEvaluator
title: REMRetrievalPrecisionEvaluator
type: object
labels:
- Evaluator
- REM
- Retrieval
- Precision
- RAG
properties:
  overall_precision:
    type: number
    description: |
      Overall precision: Relevant entities / Total retrieved entities
      Calculated as average precision across all positions.
    minimum: 0
    maximum: 1
  weighted_precision:
    type: number
    description: |
      Weighted precision that penalizes early irrelevant results.
      Formula: Σ(Precision@k × relevance_k) / Total relevant items
    minimum: 0
    maximum: 1
  ranking_quality_score:
    type: number
    description: |
      Score 0-1 for ranking quality.
      Are most relevant entities ranked higher than irrelevant ones?
    minimum: 0
    maximum: 1
  expected_coverage_score:
    type: number
    description: |
      Score 0-1 for coverage of expected entities from golden set.
      What fraction of expected entities were retrieved?
    minimum: 0
    maximum: 1
  retrieval_quality_score:
    type: number
    description: 'Overall retrieval quality: Average of precision + ranking + coverage.'
    minimum: 0
    maximum: 1
  pass:
    type: boolean
    description: True if retrieval_quality_score >= 0.70 AND overall_precision >= 0.5.
  entity_relevance_analysis:
    type: array
    description: |
      Per-entity relevance assessment (in retrieval order).
      Example: "Position 1: sarah-chen - RELEVANT (AI + database)"
    items:
      type: object
      properties:
        position:
          type: integer
        entity_label:
          type: string
        relevant:
          type: boolean
        reason:
          type: string
  precision_at_k:
    type: array
    description: |
      Precision score at each position K.
      Example: [1.0, 0.5, 0.67, 0.5, 0.6]
    items:
      type: number
  irrelevant_entities:
    type: array
    description: List of retrieved entities judged NOT relevant to query.
    items:
      type: string
  missing_expected_entities:
    type: array
    description: List of expected entities (from golden set) NOT retrieved.
    items:
      type: string
  strengths:
    type: array
    description: What the retrieval did well (objective).
    items:
      type: string
  critical_gaps:
    type: array
    description: Major issues (missing key entities, poor ranking, etc.).
    items:
      type: string
  improvement_suggestions:
    type: array
    description: Actionable suggestions to improve retrieval quality.
    items:
      type: string
  confidence_in_grading:
    type: string
    description: 'Your confidence: "high", "medium", "low"'
    enum:
    - high
    - medium
    - low
  grading_notes:
    type: string
    description: Internal notes about judgment calls or edge cases.
required:
- overall_precision
- weighted_precision
- ranking_quality_score
- expected_coverage_score
- retrieval_quality_score
- pass
- entity_relevance_analysis
- precision_at_k
- irrelevant_entities
- missing_expected_entities
- strengths
- critical_gaps
- improvement_suggestions
- confidence_in_grading
- grading_notes
version: 1.0.0
json_schema_extra:
  kind: evaluator
  name: rem-retrieval-precision
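The precision figures in the worked example follow directly from the stated formulas. A small Python sketch (illustrative only, not code from the package) that reproduces those numbers:

```python
def precision_at_k(relevance: list[bool]) -> list[float]:
    """Precision@K at each position: relevant entities seen so far / K."""
    hits, scores = 0, []
    for k, rel in enumerate(relevance, start=1):
        hits += rel
        scores.append(round(hits / k, 2))
    return scores

def weighted_precision(relevance: list[bool]) -> float:
    """Sum of Precision@k over relevant positions / total relevant items,
    which penalizes irrelevant results near the top."""
    scores = precision_at_k(relevance)
    relevant = sum(relevance)
    if relevant == 0:
        return 0.0
    return round(sum(s for s, rel in zip(scores, relevance) if rel) / relevant, 2)

# Worked example: positions 1, 3, and 5 are relevant.
rel = [True, False, True, False, True]
print(precision_at_k(rel))                     # [1.0, 0.5, 0.67, 0.5, 0.6]
print(round(sum(precision_at_k(rel)) / 5, 2))  # 0.65 overall precision@5
print(weighted_precision(rel))                 # 0.76
```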