ragtime-cli 0.2.14__tar.gz → 0.2.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragtime_cli-0.2.14/ragtime_cli.egg-info → ragtime_cli-0.2.16}/PKG-INFO +57 -3
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/README.md +56 -2
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/pyproject.toml +1 -1
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16/ragtime_cli.egg-info}/PKG-INFO +57 -3
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/SOURCES.txt +1 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/cli.py +9 -5
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/db.py +112 -0
- ragtime_cli-0.2.16/src/feedback.py +202 -0
- ragtime_cli-0.2.16/src/indexers/docs.py +312 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/mcp_server.py +128 -15
- ragtime_cli-0.2.14/src/indexers/docs.py +0 -134
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/LICENSE +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/dependency_links.txt +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/entry_points.txt +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/requires.txt +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/top_level.txt +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/setup.cfg +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/__init__.py +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/audit.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/create-pr.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/generate-docs.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/handoff.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/import-docs.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/pr-graduate.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/recall.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/remember.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/save.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/start.md +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/config.py +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/indexers/__init__.py +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/indexers/code.py +0 -0
- {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/memory.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragtime-cli
|
|
3
|
-
Version: 0.2.14
|
|
3
|
+
Version: 0.2.16
|
|
4
4
|
Summary: Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge
|
|
5
5
|
Author-email: Bret Martineau <bretwardjames@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -263,9 +263,38 @@ This is intentional - embeddings work better on focused summaries than large cod
|
|
|
263
263
|
|
|
264
264
|
For Claude/MCP usage: The search tool description instructs Claude to read returned file paths for full implementations before making code changes.
|
|
265
265
|
|
|
266
|
+
### Smart Query Understanding
|
|
267
|
+
|
|
268
|
+
Search automatically detects qualifiers in natural language:
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
# These are equivalent - qualifiers are auto-detected
|
|
272
|
+
ragtime search "error handling in mobile app"
|
|
273
|
+
ragtime search "error handling" -r mobile
|
|
274
|
+
|
|
275
|
+
# Use --raw for literal/exact search
|
|
276
|
+
ragtime search "mobile error handling" --raw
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
Auto-detected qualifiers include: mobile, web, desktop, ios, android, flutter, react, vue, dart, python, typescript, auth, api, database, frontend, backend, and more.
|
|
280
|
+
|
|
281
|
+
### Tiered Search
|
|
282
|
+
|
|
283
|
+
Use tiered search to prioritize curated knowledge over raw code:
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
# Via MCP
|
|
287
|
+
search(query="authentication", tiered=True)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Tiered search returns results in priority order:
|
|
291
|
+
1. **Memories** - Curated, high-signal knowledge
|
|
292
|
+
2. **Documentation** - Indexed markdown files
|
|
293
|
+
3. **Code** - Function signatures and symbols
|
|
294
|
+
|
|
266
295
|
### Hybrid Search
|
|
267
296
|
|
|
268
|
-
|
|
297
|
+
For explicit keyword filtering, use `require_terms`:
|
|
269
298
|
|
|
270
299
|
```bash
|
|
271
300
|
# CLI
|
|
@@ -277,6 +306,29 @@ search(query="error handling", require_terms=["mobile", "dart"])
|
|
|
277
306
|
|
|
278
307
|
This combines semantic similarity (finds conceptually related content) with keyword filtering (ensures qualifiers aren't ignored).
|
|
279
308
|
|
|
309
|
+
### Hierarchical Doc Chunking
|
|
310
|
+
|
|
311
|
+
Long markdown files are automatically chunked by headers for better search accuracy:
|
|
312
|
+
|
|
313
|
+
- Each section becomes a separate searchable chunk
|
|
314
|
+
- Parent headers are preserved as context in the embedding
|
|
315
|
+
- Short docs (<500 chars) remain as single chunks
|
|
316
|
+
- Section path is stored (e.g., "Installation > Configuration > Environment Variables")
|
|
317
|
+
|
|
318
|
+
### Feedback Loop
|
|
319
|
+
|
|
320
|
+
Search quality improves over time based on usage patterns:
|
|
321
|
+
|
|
322
|
+
```bash
|
|
323
|
+
# Record when a result is useful (via MCP)
|
|
324
|
+
record_feedback(query="auth flow", result_file="src/auth.py", action="used")
|
|
325
|
+
|
|
326
|
+
# View usage statistics
|
|
327
|
+
feedback_stats()
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
Frequently-used files receive a boost in future search rankings.
|
|
331
|
+
|
|
280
332
|
## Code Indexing
|
|
281
333
|
|
|
282
334
|
The code indexer extracts meaningful symbols from your codebase:
|
|
@@ -379,13 +431,15 @@ Add to your Claude config (`.mcp.json`):
|
|
|
379
431
|
|
|
380
432
|
Available tools:
|
|
381
433
|
- `remember` - Store a memory
|
|
382
|
-
- `search` - Semantic search
|
|
434
|
+
- `search` - Semantic search (supports tiered mode and auto-extraction)
|
|
383
435
|
- `list_memories` - List with filters
|
|
384
436
|
- `get_memory` - Get by ID
|
|
385
437
|
- `store_doc` - Store document verbatim
|
|
386
438
|
- `forget` - Delete memory
|
|
387
439
|
- `graduate` - Promote branch → app
|
|
388
440
|
- `update_status` - Change memory status
|
|
441
|
+
- `record_feedback` - Record when search results are used (improves future rankings)
|
|
442
|
+
- `feedback_stats` - View search result usage patterns
|
|
389
443
|
|
|
390
444
|
## ghp-cli Integration
|
|
391
445
|
|
|
@@ -233,9 +233,38 @@ This is intentional - embeddings work better on focused summaries than large cod
|
|
|
233
233
|
|
|
234
234
|
For Claude/MCP usage: The search tool description instructs Claude to read returned file paths for full implementations before making code changes.
|
|
235
235
|
|
|
236
|
+
### Smart Query Understanding
|
|
237
|
+
|
|
238
|
+
Search automatically detects qualifiers in natural language:
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
# These are equivalent - qualifiers are auto-detected
|
|
242
|
+
ragtime search "error handling in mobile app"
|
|
243
|
+
ragtime search "error handling" -r mobile
|
|
244
|
+
|
|
245
|
+
# Use --raw for literal/exact search
|
|
246
|
+
ragtime search "mobile error handling" --raw
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
Auto-detected qualifiers include: mobile, web, desktop, ios, android, flutter, react, vue, dart, python, typescript, auth, api, database, frontend, backend, and more.
|
|
250
|
+
|
|
251
|
+
### Tiered Search
|
|
252
|
+
|
|
253
|
+
Use tiered search to prioritize curated knowledge over raw code:
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
# Via MCP
|
|
257
|
+
search(query="authentication", tiered=True)
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Tiered search returns results in priority order:
|
|
261
|
+
1. **Memories** - Curated, high-signal knowledge
|
|
262
|
+
2. **Documentation** - Indexed markdown files
|
|
263
|
+
3. **Code** - Function signatures and symbols
|
|
264
|
+
|
|
236
265
|
### Hybrid Search
|
|
237
266
|
|
|
238
|
-
|
|
267
|
+
For explicit keyword filtering, use `require_terms`:
|
|
239
268
|
|
|
240
269
|
```bash
|
|
241
270
|
# CLI
|
|
@@ -247,6 +276,29 @@ search(query="error handling", require_terms=["mobile", "dart"])
|
|
|
247
276
|
|
|
248
277
|
This combines semantic similarity (finds conceptually related content) with keyword filtering (ensures qualifiers aren't ignored).
|
|
249
278
|
|
|
279
|
+
### Hierarchical Doc Chunking
|
|
280
|
+
|
|
281
|
+
Long markdown files are automatically chunked by headers for better search accuracy:
|
|
282
|
+
|
|
283
|
+
- Each section becomes a separate searchable chunk
|
|
284
|
+
- Parent headers are preserved as context in the embedding
|
|
285
|
+
- Short docs (<500 chars) remain as single chunks
|
|
286
|
+
- Section path is stored (e.g., "Installation > Configuration > Environment Variables")
|
|
287
|
+
|
|
288
|
+
### Feedback Loop
|
|
289
|
+
|
|
290
|
+
Search quality improves over time based on usage patterns:
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
# Record when a result is useful (via MCP)
|
|
294
|
+
record_feedback(query="auth flow", result_file="src/auth.py", action="used")
|
|
295
|
+
|
|
296
|
+
# View usage statistics
|
|
297
|
+
feedback_stats()
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
Frequently-used files receive a boost in future search rankings.
|
|
301
|
+
|
|
250
302
|
## Code Indexing
|
|
251
303
|
|
|
252
304
|
The code indexer extracts meaningful symbols from your codebase:
|
|
@@ -349,13 +401,15 @@ Add to your Claude config (`.mcp.json`):
|
|
|
349
401
|
|
|
350
402
|
Available tools:
|
|
351
403
|
- `remember` - Store a memory
|
|
352
|
-
- `search` - Semantic search
|
|
404
|
+
- `search` - Semantic search (supports tiered mode and auto-extraction)
|
|
353
405
|
- `list_memories` - List with filters
|
|
354
406
|
- `get_memory` - Get by ID
|
|
355
407
|
- `store_doc` - Store document verbatim
|
|
356
408
|
- `forget` - Delete memory
|
|
357
409
|
- `graduate` - Promote branch → app
|
|
358
410
|
- `update_status` - Change memory status
|
|
411
|
+
- `record_feedback` - Record when search results are used (improves future rankings)
|
|
412
|
+
- `feedback_stats` - View search result usage patterns
|
|
359
413
|
|
|
360
414
|
## ghp-cli Integration
|
|
361
415
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragtime-cli
|
|
3
|
-
Version: 0.2.14
|
|
3
|
+
Version: 0.2.16
|
|
4
4
|
Summary: Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge
|
|
5
5
|
Author-email: Bret Martineau <bretwardjames@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -263,9 +263,38 @@ This is intentional - embeddings work better on focused summaries than large cod
|
|
|
263
263
|
|
|
264
264
|
For Claude/MCP usage: The search tool description instructs Claude to read returned file paths for full implementations before making code changes.
|
|
265
265
|
|
|
266
|
+
### Smart Query Understanding
|
|
267
|
+
|
|
268
|
+
Search automatically detects qualifiers in natural language:
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
# These are equivalent - qualifiers are auto-detected
|
|
272
|
+
ragtime search "error handling in mobile app"
|
|
273
|
+
ragtime search "error handling" -r mobile
|
|
274
|
+
|
|
275
|
+
# Use --raw for literal/exact search
|
|
276
|
+
ragtime search "mobile error handling" --raw
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
Auto-detected qualifiers include: mobile, web, desktop, ios, android, flutter, react, vue, dart, python, typescript, auth, api, database, frontend, backend, and more.
|
|
280
|
+
|
|
281
|
+
### Tiered Search
|
|
282
|
+
|
|
283
|
+
Use tiered search to prioritize curated knowledge over raw code:
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
# Via MCP
|
|
287
|
+
search(query="authentication", tiered=True)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Tiered search returns results in priority order:
|
|
291
|
+
1. **Memories** - Curated, high-signal knowledge
|
|
292
|
+
2. **Documentation** - Indexed markdown files
|
|
293
|
+
3. **Code** - Function signatures and symbols
|
|
294
|
+
|
|
266
295
|
### Hybrid Search
|
|
267
296
|
|
|
268
|
-
|
|
297
|
+
For explicit keyword filtering, use `require_terms`:
|
|
269
298
|
|
|
270
299
|
```bash
|
|
271
300
|
# CLI
|
|
@@ -277,6 +306,29 @@ search(query="error handling", require_terms=["mobile", "dart"])
|
|
|
277
306
|
|
|
278
307
|
This combines semantic similarity (finds conceptually related content) with keyword filtering (ensures qualifiers aren't ignored).
|
|
279
308
|
|
|
309
|
+
### Hierarchical Doc Chunking
|
|
310
|
+
|
|
311
|
+
Long markdown files are automatically chunked by headers for better search accuracy:
|
|
312
|
+
|
|
313
|
+
- Each section becomes a separate searchable chunk
|
|
314
|
+
- Parent headers are preserved as context in the embedding
|
|
315
|
+
- Short docs (<500 chars) remain as single chunks
|
|
316
|
+
- Section path is stored (e.g., "Installation > Configuration > Environment Variables")
|
|
317
|
+
|
|
318
|
+
### Feedback Loop
|
|
319
|
+
|
|
320
|
+
Search quality improves over time based on usage patterns:
|
|
321
|
+
|
|
322
|
+
```bash
|
|
323
|
+
# Record when a result is useful (via MCP)
|
|
324
|
+
record_feedback(query="auth flow", result_file="src/auth.py", action="used")
|
|
325
|
+
|
|
326
|
+
# View usage statistics
|
|
327
|
+
feedback_stats()
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
Frequently-used files receive a boost in future search rankings.
|
|
331
|
+
|
|
280
332
|
## Code Indexing
|
|
281
333
|
|
|
282
334
|
The code indexer extracts meaningful symbols from your codebase:
|
|
@@ -379,13 +431,15 @@ Add to your Claude config (`.mcp.json`):
|
|
|
379
431
|
|
|
380
432
|
Available tools:
|
|
381
433
|
- `remember` - Store a memory
|
|
382
|
-
- `search` - Semantic search
|
|
434
|
+
- `search` - Semantic search (supports tiered mode and auto-extraction)
|
|
383
435
|
- `list_memories` - List with filters
|
|
384
436
|
- `get_memory` - Get by ID
|
|
385
437
|
- `store_doc` - Store document verbatim
|
|
386
438
|
- `forget` - Delete memory
|
|
387
439
|
- `graduate` - Promote branch → app
|
|
388
440
|
- `update_status` - Change memory status
|
|
441
|
+
- `record_feedback` - Record when search results are used (improves future rankings)
|
|
442
|
+
- `feedback_stats` - View search result usage patterns
|
|
389
443
|
|
|
390
444
|
## ghp-cli Integration
|
|
391
445
|
|
|
@@ -381,13 +381,13 @@ def index(path: Path, index_type: str, clear: bool):
|
|
|
381
381
|
item_show_func=lambda f: f.name[:30] if f else "",
|
|
382
382
|
) as files:
|
|
383
383
|
for file_path in files:
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
384
|
+
# index_doc_file returns list (hierarchical chunks)
|
|
385
|
+
file_entries = index_doc_file(file_path)
|
|
386
|
+
entries.extend(file_entries)
|
|
387
387
|
|
|
388
388
|
if entries:
|
|
389
389
|
_upsert_entries(db, entries, "docs")
|
|
390
|
-
click.echo(f" Indexed {len(entries)}
|
|
390
|
+
click.echo(f" Indexed {len(entries)} document chunks")
|
|
391
391
|
elif not to_delete:
|
|
392
392
|
click.echo(" All docs up to date")
|
|
393
393
|
else:
|
|
@@ -2215,8 +2215,12 @@ def update(check: bool):
|
|
|
2215
2215
|
import json
|
|
2216
2216
|
from urllib.request import urlopen
|
|
2217
2217
|
from urllib.error import URLError
|
|
2218
|
+
from importlib.metadata import version as get_version
|
|
2218
2219
|
|
|
2219
|
-
|
|
2220
|
+
try:
|
|
2221
|
+
current = get_version("ragtime-cli")
|
|
2222
|
+
except Exception:
|
|
2223
|
+
current = "0.0.0" # Fallback if not installed as package
|
|
2220
2224
|
|
|
2221
2225
|
click.echo(f"Current version: {current}")
|
|
2222
2226
|
click.echo("Checking PyPI for updates...")
|
|
@@ -238,6 +238,118 @@ class RagtimeDB:
|
|
|
238
238
|
|
|
239
239
|
return output
|
|
240
240
|
|
|
241
|
+
def search_tiered(
|
|
242
|
+
self,
|
|
243
|
+
query: str,
|
|
244
|
+
limit: int = 10,
|
|
245
|
+
namespace: str | None = None,
|
|
246
|
+
require_terms: list[str] | None = None,
|
|
247
|
+
auto_extract: bool = True,
|
|
248
|
+
**filters,
|
|
249
|
+
) -> list[dict]:
|
|
250
|
+
"""
|
|
251
|
+
Tiered search: prioritizes memories > docs > code.
|
|
252
|
+
|
|
253
|
+
Searches in priority order, filling up to limit:
|
|
254
|
+
1. Memories (curated, high-signal knowledge)
|
|
255
|
+
2. Documentation (indexed markdown)
|
|
256
|
+
3. Code (broadest, implementation details)
|
|
257
|
+
|
|
258
|
+
Args:
|
|
259
|
+
query: Natural language search query
|
|
260
|
+
limit: Max total results to return
|
|
261
|
+
namespace: Filter by namespace
|
|
262
|
+
require_terms: Terms that MUST appear in results
|
|
263
|
+
auto_extract: Auto-detect qualifiers from query
|
|
264
|
+
**filters: Additional metadata filters
|
|
265
|
+
|
|
266
|
+
Returns:
|
|
267
|
+
List of dicts with 'content', 'metadata', 'distance', 'tier'
|
|
268
|
+
"""
|
|
269
|
+
results = []
|
|
270
|
+
|
|
271
|
+
# Tier 1: Memories (not docs or code)
|
|
272
|
+
memory_results = self._search_tier(
|
|
273
|
+
query=query,
|
|
274
|
+
tier_name="memory",
|
|
275
|
+
exclude_types=["docs", "code"],
|
|
276
|
+
limit=limit,
|
|
277
|
+
namespace=namespace,
|
|
278
|
+
require_terms=require_terms,
|
|
279
|
+
auto_extract=auto_extract,
|
|
280
|
+
**filters,
|
|
281
|
+
)
|
|
282
|
+
results.extend(memory_results)
|
|
283
|
+
|
|
284
|
+
# Tier 2: Documentation
|
|
285
|
+
if len(results) < limit:
|
|
286
|
+
doc_results = self._search_tier(
|
|
287
|
+
query=query,
|
|
288
|
+
tier_name="docs",
|
|
289
|
+
type_filter="docs",
|
|
290
|
+
limit=limit - len(results),
|
|
291
|
+
namespace=namespace,
|
|
292
|
+
require_terms=require_terms,
|
|
293
|
+
auto_extract=auto_extract,
|
|
294
|
+
**filters,
|
|
295
|
+
)
|
|
296
|
+
results.extend(doc_results)
|
|
297
|
+
|
|
298
|
+
# Tier 3: Code
|
|
299
|
+
if len(results) < limit:
|
|
300
|
+
code_results = self._search_tier(
|
|
301
|
+
query=query,
|
|
302
|
+
tier_name="code",
|
|
303
|
+
type_filter="code",
|
|
304
|
+
limit=limit - len(results),
|
|
305
|
+
namespace=namespace,
|
|
306
|
+
require_terms=require_terms,
|
|
307
|
+
auto_extract=auto_extract,
|
|
308
|
+
**filters,
|
|
309
|
+
)
|
|
310
|
+
results.extend(code_results)
|
|
311
|
+
|
|
312
|
+
return results
|
|
313
|
+
|
|
314
|
+
def _search_tier(
|
|
315
|
+
self,
|
|
316
|
+
query: str,
|
|
317
|
+
tier_name: str,
|
|
318
|
+
limit: int,
|
|
319
|
+
type_filter: str | None = None,
|
|
320
|
+
exclude_types: list[str] | None = None,
|
|
321
|
+
**kwargs,
|
|
322
|
+
) -> list[dict]:
|
|
323
|
+
"""Search a single tier and tag results."""
|
|
324
|
+
# Build where clause for exclusion if needed
|
|
325
|
+
if exclude_types:
|
|
326
|
+
# Search without type filter, then exclude in post-processing
|
|
327
|
+
results = self.search(
|
|
328
|
+
query=query,
|
|
329
|
+
limit=limit * 2, # fetch more since we'll filter
|
|
330
|
+
type_filter=None,
|
|
331
|
+
**kwargs,
|
|
332
|
+
)
|
|
333
|
+
# Filter out excluded types
|
|
334
|
+
filtered = []
|
|
335
|
+
for r in results:
|
|
336
|
+
if r["metadata"].get("type") not in exclude_types:
|
|
337
|
+
r["tier"] = tier_name
|
|
338
|
+
filtered.append(r)
|
|
339
|
+
if len(filtered) >= limit:
|
|
340
|
+
break
|
|
341
|
+
return filtered
|
|
342
|
+
else:
|
|
343
|
+
results = self.search(
|
|
344
|
+
query=query,
|
|
345
|
+
limit=limit,
|
|
346
|
+
type_filter=type_filter,
|
|
347
|
+
**kwargs,
|
|
348
|
+
)
|
|
349
|
+
for r in results:
|
|
350
|
+
r["tier"] = tier_name
|
|
351
|
+
return results
|
|
352
|
+
|
|
241
353
|
def delete(self, ids: list[str]) -> None:
|
|
242
354
|
"""Delete documents by ID."""
|
|
243
355
|
self.collection.delete(ids=ids)
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Feedback loop for RAG result quality improvement.
|
|
3
|
+
|
|
4
|
+
Tracks which search results are actually used/referenced by Claude,
|
|
5
|
+
enabling re-ranking and quality improvements over time.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from dataclasses import dataclass, field, asdict
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _current_timestamp() -> str:
    """Return the current local time (naive) as an ISO-8601 string."""
    return datetime.now().isoformat()


@dataclass
class SearchFeedback:
    """
    One feedback record describing what happened to one search result.

    Attributes:
        query: The search query that produced the result
        result_id: ChromaDB document ID of the result
        result_file: File path of the result (kept for easier debugging)
        action: One of "used", "referenced", "ignored", "helpful",
            "not_helpful"
        timestamp: ISO-8601 creation time; filled in automatically
        session_id: Optional identifier grouping related searches
        position: 1-indexed rank of the result in the result list
            (defaults to 0)
        distance: Original semantic distance of the result
    """
    query: str
    result_id: str
    result_file: str
    action: str
    timestamp: str = field(default_factory=_current_timestamp)
    session_id: Optional[str] = None
    position: int = 0
    distance: float = 0.0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FeedbackStore:
    """
    Simple file-based feedback storage.

    Feedback is appended to `feedback.jsonl` (one JSON object per line)
    inside the given directory, which keeps the log easy to inspect and
    analyse. Can be upgraded to SQLite or ChromaDB later.
    """

    def __init__(self, path: Path):
        """
        Initialize feedback store.

        Args:
            path: Directory to store feedback data (created if missing)
        """
        self.path = path
        self.feedback_file = path / "feedback.jsonl"
        self.stats_file = path / "feedback_stats.json"
        path.mkdir(parents=True, exist_ok=True)

    def record(self, feedback: SearchFeedback) -> None:
        """
        Append a single feedback entry to the JSON-lines log.

        Args:
            feedback: The entry to persist
        """
        # Explicit encoding so the log does not depend on the platform's
        # default locale.
        with open(self.feedback_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(asdict(feedback)) + "\n")

    def record_usage(
        self,
        query: str,
        result_id: str,
        result_file: str,
        position: int = 0,
        distance: float = 0.0,
        session_id: Optional[str] = None,
    ) -> None:
        """
        Convenience method to record that a result was used.

        Args:
            query: The search query that produced the result
            result_id: Document ID of the result
            result_file: File path of the result
            position: 1-indexed rank of the result (0 when not supplied)
            distance: Original semantic distance of the result
            session_id: Optional identifier grouping related searches
        """
        self.record(SearchFeedback(
            query=query,
            result_id=result_id,
            result_file=result_file,
            action="used",
            position=position,
            distance=distance,
            session_id=session_id,
        ))

    def record_batch(
        self,
        query: str,
        used_ids: list[str],
        all_results: list[dict],
        session_id: Optional[str] = None,
    ) -> None:
        """
        Record feedback for a batch of results.

        Every result in `all_results` gets one entry: "used" when its
        'id' is in `used_ids`, "ignored" otherwise. Positions are
        1-indexed in the order of `all_results`.

        Args:
            query: The search query that produced the results
            used_ids: IDs of the results that were actually used
            all_results: Result dicts; 'id', 'metadata.file' and
                'distance' are read when present
            session_id: Optional identifier grouping related searches
        """
        used_set = set(used_ids)

        for position, result in enumerate(all_results, start=1):
            result_id = result.get("id", "")

            self.record(SearchFeedback(
                query=query,
                result_id=result_id,
                result_file=result.get("metadata", {}).get("file", ""),
                action="used" if result_id in used_set else "ignored",
                position=position,
                distance=result.get("distance", 0.0),
                session_id=session_id,
            ))

    def get_usage_stats(self) -> dict:
        """
        Get aggregated usage statistics.

        Returns:
            Dict with the keys 'total', 'used', 'ignored', 'helpful',
            'not_helpful', 'files_used' (file path -> times used) and
            'avg_position_used'. The same keys are returned whether or
            not any feedback has been recorded yet.
        """
        stats = {
            "total": 0,
            "used": 0,
            "ignored": 0,
            "helpful": 0,
            "not_helpful": 0,
            "files_used": {},  # file -> count
            "avg_position_used": 0.0,
        }

        if not self.feedback_file.exists():
            return stats

        positions = []

        with open(self.feedback_file, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A syntactically valid line that is not an object
                # (e.g. "[]") is as unusable as a corrupt one.
                if not isinstance(entry, dict):
                    continue

                stats["total"] += 1
                action = entry.get("action", "")

                if action == "used":
                    stats["used"] += 1

                    # Positions are 1-indexed; 0 is the "not supplied"
                    # default and would drag the average down.
                    position = entry.get("position", 0)
                    if isinstance(position, (int, float)) and position > 0:
                        positions.append(position)

                    # An empty path would later match every result that
                    # has no file metadata, so it is not counted.
                    file_path = entry.get("result_file", "")
                    if file_path:
                        stats["files_used"][file_path] = (
                            stats["files_used"].get(file_path, 0) + 1
                        )
                elif action in ("ignored", "helpful", "not_helpful"):
                    stats[action] += 1

        if positions:
            stats["avg_position_used"] = sum(positions) / len(positions)

        return stats

    def get_boost_scores(self) -> dict[str, float]:
        """
        Calculate boost scores for files based on historical usage.

        Returns:
            Dict mapping file paths to boost multipliers in the range
            (1.0, 1.5]; the most-used file gets 1.5. Files that were
            never used are absent (treated as 1.0 = no boost).
        """
        files_used = self.get_usage_stats().get("files_used", {})

        if not files_used:
            return {}

        # Normalize counts against the most-used file, then map the
        # resulting 0-1 share onto a 1.0-1.5 multiplier.
        max_count = max(files_used.values())
        return {
            file_path: 1.0 + (count / max_count) * 0.5
            for file_path, count in files_used.items()
        }

    def apply_boosts(self, results: list[dict], boosts: dict[str, float]) -> list[dict]:
        """
        Apply historical boost scores to search results.

        Lower distance = more relevant, so each result's distance is
        divided by the boost of its file. Result dicts are updated in
        place ('distance' adjusted, 'boosted' flag added).

        Args:
            results: Result dicts; 'metadata.file' and 'distance' are read
            boosts: File path -> multiplier, as from `get_boost_scores`

        Returns:
            `results` unchanged when `boosts` is empty; otherwise a new
            list sorted by adjusted distance, with results lacking a
            distance placed last.
        """
        if not boosts:
            return results

        for result in results:
            file_path = result.get("metadata", {}).get("file", "")
            boost = boosts.get(file_path, 1.0)
            if "distance" in result and result["distance"]:
                # Reduce distance for frequently-used files
                result["distance"] = result["distance"] / boost
            result["boosted"] = boost > 1.0

        def _distance_or_inf(result: dict) -> float:
            # A missing or None distance sorts last instead of raising
            # TypeError when compared with floats.
            distance = result.get("distance")
            return float("inf") if distance is None else distance

        # Re-sort by adjusted distance
        return sorted(results, key=_distance_or_inf)

    def clear(self) -> None:
        """Clear all feedback data by deleting the feedback log, if any."""
        if self.feedback_file.exists():
            self.feedback_file.unlink()
|