remdb 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.0.dist-info/METADATA +1455 -0
- remdb-0.3.0.dist-info/RECORD +187 -0
- remdb-0.3.0.dist-info/WHEEL +4 -0
- remdb-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
description: "You are THE JUDGE evaluating REM retrieval quality using recall metrics.\n\
|
|
2
|
+
\n**Context Recall Evaluation (inspired by RAGAS)**\n\nYour job is to evaluate whether\
|
|
3
|
+
\ REM query execution retrieves ALL relevant entities\nthat should be found for\
|
|
4
|
+
\ a given query.\n\n**Key Concept: Recall**\n\nRecall measures: \"Of all the relevant\
|
|
5
|
+
\ entities that SHOULD be retrieved, how many were actually retrieved?\"\n\nFormula:\
|
|
6
|
+
\ Retrieved relevant entities / Total relevant entities (from golden set)\n\n**The\
|
|
7
|
+
\ Coverage Problem:**\n\n- **High Precision, Low Recall**: Retrieved entities are\
|
|
8
|
+
\ relevant, but many are missing\n- **Low Precision, High Recall**: Retrieved many\
|
|
9
|
+
\ entities, but also grabbed irrelevant ones\n- **Goal**: High precision AND high\
|
|
10
|
+
\ recall\n\n**Your Task:**\n\n1. **Review expected entities** from golden set (what\
|
|
11
|
+
\ SHOULD be retrieved)\n2. **Review retrieved entities** from REM query\n3. **Calculate\
|
|
12
|
+
\ recall** - what fraction of expected entities were found?\n4. **Identify gaps**\
|
|
13
|
+
\ - which expected entities are missing?\n\n**Example Evaluation:**\n\nQuery: \"\
|
|
14
|
+
SEARCH person AI engineer with database experience\"\n\nExpected Entities (from\
|
|
15
|
+
\ golden set):\n- sarah-chen (person) - \"AI engineer with 5 years PostgreSQL experience\"\
|
|
16
|
+
\n- alice-wang (person) - \"Database administrator with ML background\"\n- eve-jones\
|
|
17
|
+
\ (person) - \"Data scientist with PostgreSQL expertise\"\n\nRetrieved Entities:\n\
|
|
18
|
+
- sarah-chen ✓ (found)\n- john-doe (not expected - false positive)\n- alice-wang\
|
|
19
|
+
\ ✓ (found)\n- bob-smith (not expected - false positive)\n\nRecall Calculation:\n\
|
|
20
|
+
- Found: sarah-chen, alice-wang (2 entities)\n- Expected: sarah-chen, alice-wang,\
|
|
21
|
+
\ eve-jones (3 entities)\n- Recall: 2/3 = 0.67 (67%)\n\nMissing: eve-jones (why?\
|
|
22
|
+
\ Bad embedding? Wrong query parsing?)\n\n**Recall Criteria:**\n\nFor each expected\
|
|
23
|
+
\ entity from golden set:\n1. Was it retrieved? (present in results)\n2. If not,\
|
|
24
|
+
\ why might it be missing?\n - Embedding quality issue?\n - Query parsing problem?\n\
|
|
25
|
+
\ - Entity missing from database?\n - Ranking too low (buried beyond top-K)?\n\
|
|
26
|
+
\n**Scoring Rules:**\n\n**Recall Score (0.0-1.0):**\n- 1.0: All expected entities\
|
|
27
|
+
\ retrieved\n- 0.8: Missing 1 expected entity (80%+ recall)\n- 0.6: Missing 2-3\
|
|
28
|
+
\ expected entities (60-80% recall)\n- 0.4: Missing several expected entities (40-60%\
|
|
29
|
+
\ recall)\n- 0.2: Missing most expected entities (20-40% recall)\n- 0.0: Missing\
|
|
30
|
+
\ all expected entities (0% recall)\n\n**Ranking Depth (0.0-1.0):**\n- How deep\
|
|
31
|
+
\ in results are expected entities found?\n- 1.0: All expected entities in top 3\
|
|
32
|
+
\ positions\n- 0.8: All expected entities in top 5 positions\n- 0.6: All expected\
|
|
33
|
+
\ entities in top 10 positions\n- 0.4: Some expected entities beyond position 10\n\
|
|
34
|
+
- 0.2: Expected entities buried deep in results\n- 0.0: Expected entities not found\
|
|
35
|
+
\ at all\n\n**Coverage Quality (0.0-1.0):**\n- Balance between recall and precision\n\
|
|
36
|
+
- 1.0: High recall (>0.9) AND high precision (>0.8)\n- 0.8: Good recall (>0.7) AND\
|
|
37
|
+
\ good precision (>0.6)\n- 0.6: Moderate recall (>0.5) AND moderate precision (>0.5)\n\
|
|
38
|
+
- 0.4: Poor recall or precision\n- 0.2: Very poor recall and precision\n- 0.0: Nearly\
|
|
39
|
+
\ zero recall or precision\n\n**YOUR ROLE: STRICT AND DIAGNOSTIC**\n\n1. **NO CELEBRATION**\
|
|
40
|
+
\ - Grade objectively\n2. **STRICT GRADING** - Missing entities = lower recall\n\
|
|
41
|
+
3. **DIAGNOSE GAPS** - Why are expected entities missing?\n4. **RANKING DEPTH**\
|
|
42
|
+
\ - Are expected entities buried deep?\n\nCompare retrieved entities to expected\
|
|
43
|
+
\ golden set carefully.\nIdentify ALL missing entities and hypothesize why they're\
|
|
44
|
+
\ missing.\n"
|
|
45
|
+
fully_qualified_name: rem.evaluators.retrieval_recall.REMRetrievalRecallEvaluator
|
|
46
|
+
title: REMRetrievalRecallEvaluator
|
|
47
|
+
type: object
|
|
48
|
+
labels:
|
|
49
|
+
- Evaluator
|
|
50
|
+
- REM
|
|
51
|
+
- Retrieval
|
|
52
|
+
- Recall
|
|
53
|
+
- RAG
|
|
54
|
+
properties:
|
|
55
|
+
recall_score:
|
|
56
|
+
type: number
|
|
57
|
+
description: 'Recall: Retrieved expected entities / Total expected entities.
|
|
58
|
+
|
|
59
|
+
Formula: |Found ∩ Expected| / |Expected|
|
|
60
|
+
|
|
61
|
+
'
|
|
62
|
+
minimum: 0
|
|
63
|
+
maximum: 1
|
|
64
|
+
ranking_depth_score:
|
|
65
|
+
type: number
|
|
66
|
+
description: 'Score 0-1 for ranking depth of expected entities.
|
|
67
|
+
|
|
68
|
+
Are expected entities ranked high (top-K) or buried deep?
|
|
69
|
+
|
|
70
|
+
'
|
|
71
|
+
minimum: 0
|
|
72
|
+
maximum: 1
|
|
73
|
+
coverage_quality_score:
|
|
74
|
+
type: number
|
|
75
|
+
description: 'Balance between recall and precision.
|
|
76
|
+
|
|
77
|
+
Combines recall score with precision context.
|
|
78
|
+
|
|
79
|
+
'
|
|
80
|
+
minimum: 0
|
|
81
|
+
maximum: 1
|
|
82
|
+
retrieval_completeness_score:
|
|
83
|
+
type: number
|
|
84
|
+
description: 'Overall completeness: Average of recall_score, ranking_depth_score, and coverage_quality_score (sum/3).
|
|
85
|
+
|
|
86
|
+
'
|
|
87
|
+
minimum: 0
|
|
88
|
+
maximum: 1
|
|
89
|
+
pass:
|
|
90
|
+
type: boolean
|
|
91
|
+
description: 'True if recall_score >= 0.70 AND retrieval_completeness_score >=
|
|
92
|
+
0.70.
|
|
93
|
+
|
|
94
|
+
'
|
|
95
|
+
expected_entities_found:
|
|
96
|
+
type: array
|
|
97
|
+
description: 'List of expected entities that WERE retrieved.
|
|
98
|
+
|
|
99
|
+
Include position in results.
|
|
100
|
+
|
|
101
|
+
'
|
|
102
|
+
items:
|
|
103
|
+
type: object
|
|
104
|
+
properties:
|
|
105
|
+
entity_label:
|
|
106
|
+
type: string
|
|
107
|
+
position:
|
|
108
|
+
type: integer
|
|
109
|
+
notes:
|
|
110
|
+
type: string
|
|
111
|
+
missing_expected_entities:
|
|
112
|
+
type: array
|
|
113
|
+
description: 'List of expected entities that were NOT retrieved.
|
|
114
|
+
|
|
115
|
+
Include hypothesis for why missing.
|
|
116
|
+
|
|
117
|
+
'
|
|
118
|
+
items:
|
|
119
|
+
type: object
|
|
120
|
+
properties:
|
|
121
|
+
entity_label:
|
|
122
|
+
type: string
|
|
123
|
+
entity_type:
|
|
124
|
+
type: string
|
|
125
|
+
missing_reason_hypothesis:
|
|
126
|
+
type: string
|
|
127
|
+
description: "Why might this entity be missing?\nOptions: \"embedding_quality\"\
|
|
128
|
+
, \"query_parsing\", \"not_in_db\",\n \"ranking_too_low\", \"\
|
|
129
|
+
type_filtering\", \"other\"\n"
|
|
130
|
+
recall_analysis:
|
|
131
|
+
type: string
|
|
132
|
+
description: "Detailed analysis of recall performance.\nExample: \"Found 3 of\
|
|
133
|
+
\ 4 expected entities (75% recall). Missing 'eve-jones'\n likely due\
|
|
134
|
+
\ to poor embedding quality - her profile mentions 'data scientist'\n \
|
|
135
|
+
\ not 'AI engineer' explicitly.\"\n"
|
|
136
|
+
ranking_depth_analysis:
|
|
137
|
+
type: string
|
|
138
|
+
description: "Analysis of where expected entities appear in results.\nExample:\
|
|
139
|
+
\ \"Expected entities ranked at positions 1, 3, 8. Position 8 is too deep\n\
|
|
140
|
+
\ for typical user queries (most users check top 5).\"\n"
|
|
141
|
+
false_positives:
|
|
142
|
+
type: array
|
|
143
|
+
description: 'Entities retrieved but NOT in expected set.
|
|
144
|
+
|
|
145
|
+
Note: Not necessarily wrong (golden set may be incomplete).
|
|
146
|
+
|
|
147
|
+
'
|
|
148
|
+
items:
|
|
149
|
+
type: string
|
|
150
|
+
strengths:
|
|
151
|
+
type: array
|
|
152
|
+
description: 'What the retrieval did well (objective).
|
|
153
|
+
|
|
154
|
+
'
|
|
155
|
+
items:
|
|
156
|
+
type: string
|
|
157
|
+
critical_gaps:
|
|
158
|
+
type: array
|
|
159
|
+
description: 'Major issues (missing key entities, poor coverage, etc.).
|
|
160
|
+
|
|
161
|
+
'
|
|
162
|
+
items:
|
|
163
|
+
type: string
|
|
164
|
+
improvement_suggestions:
|
|
165
|
+
type: array
|
|
166
|
+
description: 'Actionable suggestions to improve recall.
|
|
167
|
+
|
|
168
|
+
Example: "Improve embeddings for ''data scientist'' → ''AI engineer'' semantic
|
|
169
|
+
similarity"
|
|
170
|
+
|
|
171
|
+
'
|
|
172
|
+
items:
|
|
173
|
+
type: string
|
|
174
|
+
confidence_in_grading:
|
|
175
|
+
type: string
|
|
176
|
+
description: 'Your confidence: "high", "medium", "low"
|
|
177
|
+
|
|
178
|
+
Note: Low confidence if golden set may be incomplete
|
|
179
|
+
|
|
180
|
+
'
|
|
181
|
+
enum:
|
|
182
|
+
- high
|
|
183
|
+
- medium
|
|
184
|
+
- low
|
|
185
|
+
grading_notes:
|
|
186
|
+
type: string
|
|
187
|
+
description: 'Internal notes about judgment calls.
|
|
188
|
+
|
|
189
|
+
Note if golden set seems incomplete (retrieved valid entities not in expected).
|
|
190
|
+
|
|
191
|
+
'
|
|
192
|
+
required:
|
|
193
|
+
- recall_score
|
|
194
|
+
- ranking_depth_score
|
|
195
|
+
- coverage_quality_score
|
|
196
|
+
- retrieval_completeness_score
|
|
197
|
+
- pass
|
|
198
|
+
- expected_entities_found
|
|
199
|
+
- missing_expected_entities
|
|
200
|
+
- recall_analysis
|
|
201
|
+
- ranking_depth_analysis
|
|
202
|
+
- false_positives
|
|
203
|
+
- strengths
|
|
204
|
+
- critical_gaps
|
|
205
|
+
- improvement_suggestions
|
|
206
|
+
- confidence_in_grading
|
|
207
|
+
- grading_notes
|
|
208
|
+
version: 1.0.0
|
|
209
|
+
json_schema_extra:
|
|
210
|
+
kind: evaluator
|
|
211
|
+
name: rem-retrieval-recall
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
description: "You are THE JUDGE evaluating a REM agent's response to a SEARCH query.\n\
|
|
2
|
+
\n**REM SEARCH Query Pattern:**\n\nSEARCH queries perform semantic vector search\
|
|
3
|
+
\ across entity types:\n- Format: \"SEARCH entity_types query_text\"\n- Examples:\n\
|
|
4
|
+
\ - \"SEARCH person,project AI engineer with database experience\"\n - \"SEARCH\
|
|
5
|
+
\ technology graph database with vector support\"\n - \"SEARCH document migration\
|
|
6
|
+
\ planning guide\"\n\n**Expected Behavior:**\n\n1. **Semantic Ranking**: Results\
|
|
7
|
+
\ ranked by relevance to query\n2. **Type Filtering**: Only return requested entity\
|
|
8
|
+
\ types\n3. **Top-K Results**: Typically return 5-10 most relevant entities\n4.\
|
|
9
|
+
\ **Relevance Scores**: Include similarity scores when available\n5. **Entity Labels**:\
|
|
10
|
+
\ Use natural language labels (not UUIDs)\n\n**Common Errors to Catch:**\n\n1. **Wrong\
|
|
11
|
+
\ Entity Types**:\n - Returns person when asked for project\n - Mixes types\
|
|
12
|
+
\ when specific type requested\n\n2. **Poor Relevance**:\n - Returns unrelated\
|
|
13
|
+
\ entities\n - Missing obviously relevant entities from reference\n - Poor ranking\
|
|
14
|
+
\ (irrelevant results ranked high)\n\n3. **Incomplete Results**:\n - Returns fewer\
|
|
15
|
+
\ results than expected\n - Missing key entities from reference golden set\n\n\
|
|
16
|
+
4. **Hallucinations**:\n - Invented entities not in reference\n - Made-up properties\
|
|
17
|
+
\ or metadata\n\n**YOUR ROLE: STRICT AND CRITICAL JUDGE**\n\n1. **NO CELEBRATION**\
|
|
18
|
+
\ - Grade objectively\n2. **STRICT GRADING** - Missing relevant results = points\
|
|
19
|
+
\ deducted\n3. **CATCH HALLUCINATIONS** - Made-up entities = FAIL\n4. **VERIFY RELEVANCE**\
|
|
20
|
+
\ - Are results actually related to query?\n5. **CHECK RANKING** - Are most relevant\
|
|
21
|
+
\ results ranked first?\n\n**Scoring Rubric:**\n\n**Relevance (0.0-1.0):**\n- 1.0:\
|
|
22
|
+
\ All results highly relevant to query\n- 0.8: Most results relevant, 1-2 borderline\n\
|
|
23
|
+
- 0.6: Several irrelevant results\n- 0.4: Many irrelevant results\n- 0.2: Mostly\
|
|
24
|
+
\ irrelevant\n- 0.0: Completely irrelevant or wrong types\n\n**Completeness (0.0-1.0):**\n\
|
|
25
|
+
- 1.0: All expected entities from reference present\n- 0.8: Missing 1 expected entity\n\
|
|
26
|
+
- 0.6: Missing 2-3 expected entities\n- 0.4: Missing several expected entities\n\
|
|
27
|
+
- 0.2: Missing most expected entities\n- 0.0: Missing all expected entities\n\n\
|
|
28
|
+
**Ranking Quality (0.0-1.0):**\n- 1.0: Most relevant results ranked first\n- 0.8:\
|
|
29
|
+
\ Good ranking with minor issues\n- 0.6: Mediocre ranking (some relevant buried)\n\
|
|
30
|
+
- 0.4: Poor ranking\n- 0.2: Very poor ranking\n- 0.0: No discernible ranking logic\n\
|
|
31
|
+
\n**Overall Score:** Average of 3 dimensions\n**Pass Threshold:** >= 0.70 (slightly\
|
|
32
|
+
\ lower than LOOKUP - semantic matching is harder)\n\nCompare agent results to reference\
|
|
33
|
+
\ golden set. Check relevance, completeness, ranking.\n"
|
|
34
|
+
fully_qualified_name: rem.evaluators.search_correctness.REMSearchCorrectnessEvaluator
|
|
35
|
+
title: REMSearchCorrectnessEvaluator
|
|
36
|
+
type: object
|
|
37
|
+
labels:
|
|
38
|
+
- Evaluator
|
|
39
|
+
- REM
|
|
40
|
+
- SEARCH
|
|
41
|
+
- Correctness
|
|
42
|
+
- Semantic
|
|
43
|
+
properties:
|
|
44
|
+
relevance_score:
|
|
45
|
+
type: number
|
|
46
|
+
description: 'Score 0-1 for relevance of returned entities to query.
|
|
47
|
+
|
|
48
|
+
Are results semantically related to query text?
|
|
49
|
+
|
|
50
|
+
Are entity types correct?
|
|
51
|
+
|
|
52
|
+
'
|
|
53
|
+
minimum: 0
|
|
54
|
+
maximum: 1
|
|
55
|
+
completeness_score:
|
|
56
|
+
type: number
|
|
57
|
+
description: 'Score 0-1 for completeness compared to reference.
|
|
58
|
+
|
|
59
|
+
Are all expected entities from reference present?
|
|
60
|
+
|
|
61
|
+
Are key relevant entities included?
|
|
62
|
+
|
|
63
|
+
'
|
|
64
|
+
minimum: 0
|
|
65
|
+
maximum: 1
|
|
66
|
+
ranking_quality_score:
|
|
67
|
+
type: number
|
|
68
|
+
description: 'Score 0-1 for ranking quality.
|
|
69
|
+
|
|
70
|
+
Are most relevant results ranked first?
|
|
71
|
+
|
|
72
|
+
Is there clear relevance ordering?
|
|
73
|
+
|
|
74
|
+
'
|
|
75
|
+
minimum: 0
|
|
76
|
+
maximum: 1
|
|
77
|
+
overall_score:
|
|
78
|
+
type: number
|
|
79
|
+
description: 'Average of relevance + completeness + ranking_quality (sum/3).
|
|
80
|
+
|
|
81
|
+
'
|
|
82
|
+
minimum: 0
|
|
83
|
+
maximum: 1
|
|
84
|
+
pass:
|
|
85
|
+
type: boolean
|
|
86
|
+
description: 'True if overall_score >= 0.70 AND relevance_score >= 0.5
|
|
87
|
+
|
|
88
|
+
AND no hallucinated entities detected.
|
|
89
|
+
|
|
90
|
+
'
|
|
91
|
+
relevance_details:
|
|
92
|
+
type: string
|
|
93
|
+
description: 'Assessment of result relevance to query.
|
|
94
|
+
|
|
95
|
+
Example: "First 3 results highly relevant, last 2 borderline"
|
|
96
|
+
|
|
97
|
+
'
|
|
98
|
+
completeness_details:
|
|
99
|
+
type: string
|
|
100
|
+
description: 'Comparison to reference golden set.
|
|
101
|
+
|
|
102
|
+
Example: "Missing ''sarah-chen'' person entity expected in top results"
|
|
103
|
+
|
|
104
|
+
'
|
|
105
|
+
ranking_details:
|
|
106
|
+
type: string
|
|
107
|
+
description: 'Assessment of ranking quality.
|
|
108
|
+
|
|
109
|
+
Example: "Most relevant entity ranked #3 (should be #1)"
|
|
110
|
+
|
|
111
|
+
'
|
|
112
|
+
hallucinations_detected:
|
|
113
|
+
type: array
|
|
114
|
+
description: 'List of entities in results but not in reference.
|
|
115
|
+
|
|
116
|
+
May not be errors (new data) but flag for review.
|
|
117
|
+
|
|
118
|
+
'
|
|
119
|
+
items:
|
|
120
|
+
type: string
|
|
121
|
+
missing_expected_entities:
|
|
122
|
+
type: array
|
|
123
|
+
description: 'List of entities in reference but missing from results.
|
|
124
|
+
|
|
125
|
+
'
|
|
126
|
+
items:
|
|
127
|
+
type: string
|
|
128
|
+
irrelevant_results:
|
|
129
|
+
type: array
|
|
130
|
+
description: 'List of results that don''t match query intent.
|
|
131
|
+
|
|
132
|
+
'
|
|
133
|
+
items:
|
|
134
|
+
type: string
|
|
135
|
+
strengths:
|
|
136
|
+
type: array
|
|
137
|
+
description: 'What the search did well (objective).
|
|
138
|
+
|
|
139
|
+
'
|
|
140
|
+
items:
|
|
141
|
+
type: string
|
|
142
|
+
critical_gaps:
|
|
143
|
+
type: array
|
|
144
|
+
description: 'Major issues (missing key results, wrong types, etc.).
|
|
145
|
+
|
|
146
|
+
'
|
|
147
|
+
items:
|
|
148
|
+
type: string
|
|
149
|
+
improvement_suggestions:
|
|
150
|
+
type: array
|
|
151
|
+
description: 'Actionable suggestions to improve search quality.
|
|
152
|
+
|
|
153
|
+
'
|
|
154
|
+
items:
|
|
155
|
+
type: string
|
|
156
|
+
confidence_in_grading:
|
|
157
|
+
type: string
|
|
158
|
+
description: 'Your confidence: "high", "medium", "low"
|
|
159
|
+
|
|
160
|
+
(Semantic matching is subjective - lower confidence OK)
|
|
161
|
+
|
|
162
|
+
'
|
|
163
|
+
enum:
|
|
164
|
+
- high
|
|
165
|
+
- medium
|
|
166
|
+
- low
|
|
167
|
+
grading_notes:
|
|
168
|
+
type: string
|
|
169
|
+
description: 'Internal notes about judgment calls or edge cases.
|
|
170
|
+
|
|
171
|
+
'
|
|
172
|
+
required:
|
|
173
|
+
- relevance_score
|
|
174
|
+
- completeness_score
|
|
175
|
+
- ranking_quality_score
|
|
176
|
+
- overall_score
|
|
177
|
+
- pass
|
|
178
|
+
- relevance_details
|
|
179
|
+
- completeness_details
|
|
180
|
+
- ranking_details
|
|
181
|
+
- hallucinations_detected
|
|
182
|
+
- missing_expected_entities
|
|
183
|
+
- irrelevant_results
|
|
184
|
+
- strengths
|
|
185
|
+
- critical_gaps
|
|
186
|
+
- improvement_suggestions
|
|
187
|
+
- confidence_in_grading
|
|
188
|
+
- grading_notes
|
|
189
|
+
version: 1.0.0
|
|
190
|
+
json_schema_extra:
|
|
191
|
+
kind: evaluator
|
|
192
|
+
name: rem-search-correctness
|
rem/services/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
REM Services
|
|
3
|
+
|
|
4
|
+
Service layer for REM system operations:
|
|
5
|
+
- PostgresService: PostgreSQL/CloudNativePG database operations
|
|
6
|
+
- RemService: REM query execution and graph operations
|
|
7
|
+
|
|
8
|
+
For file/S3 operations, use rem.services.fs (FileSystemService is also re-exported from this package):
|
|
9
|
+
from rem.services.fs import FS, S3Provider
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from .fs.service import FileSystemService
|
|
13
|
+
from .postgres import PostgresService
|
|
14
|
+
from .rem import RemService
|
|
15
|
+
|
|
16
|
+
__all__ = ["PostgresService", "RemService", "FileSystemService"]
|