ragtime-cli 0.2.14__tar.gz → 0.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {ragtime_cli-0.2.14/ragtime_cli.egg-info → ragtime_cli-0.2.16}/PKG-INFO +57 -3
  2. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/README.md +56 -2
  3. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/pyproject.toml +1 -1
  4. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16/ragtime_cli.egg-info}/PKG-INFO +57 -3
  5. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/SOURCES.txt +1 -0
  6. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/cli.py +9 -5
  7. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/db.py +112 -0
  8. ragtime_cli-0.2.16/src/feedback.py +202 -0
  9. ragtime_cli-0.2.16/src/indexers/docs.py +312 -0
  10. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/mcp_server.py +128 -15
  11. ragtime_cli-0.2.14/src/indexers/docs.py +0 -134
  12. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/LICENSE +0 -0
  13. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/dependency_links.txt +0 -0
  14. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/entry_points.txt +0 -0
  15. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/requires.txt +0 -0
  16. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/top_level.txt +0 -0
  17. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/setup.cfg +0 -0
  18. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/__init__.py +0 -0
  19. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/audit.md +0 -0
  20. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/create-pr.md +0 -0
  21. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/generate-docs.md +0 -0
  22. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/handoff.md +0 -0
  23. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/import-docs.md +0 -0
  24. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/pr-graduate.md +0 -0
  25. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/recall.md +0 -0
  26. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/remember.md +0 -0
  27. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/save.md +0 -0
  28. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/commands/start.md +0 -0
  29. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/config.py +0 -0
  30. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/indexers/__init__.py +0 -0
  31. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/indexers/code.py +0 -0
  32. {ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/memory.py +0 -0

{ragtime_cli-0.2.14/ragtime_cli.egg-info → ragtime_cli-0.2.16}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragtime-cli
- Version: 0.2.14
+ Version: 0.2.16
  Summary: Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge
  Author-email: Bret Martineau <bretwardjames@gmail.com>
  License-Expression: MIT
@@ -263,9 +263,38 @@ This is intentional - embeddings work better on focused summaries than large cod

  For Claude/MCP usage: The search tool description instructs Claude to read returned file paths for full implementations before making code changes.

+ ### Smart Query Understanding
+
+ Search automatically detects qualifiers in natural language:
+
+ ```bash
+ # These are equivalent - qualifiers are auto-detected
+ ragtime search "error handling in mobile app"
+ ragtime search "error handling" -r mobile
+
+ # Use --raw for literal/exact search
+ ragtime search "mobile error handling" --raw
+ ```
+
+ Auto-detected qualifiers include: mobile, web, desktop, ios, android, flutter, react, vue, dart, python, typescript, auth, api, database, frontend, backend, and more.
+
+ ### Tiered Search
+
+ Use tiered search to prioritize curated knowledge over raw code:
+
+ ```bash
+ # Via MCP
+ search(query="authentication", tiered=True)
+ ```
+
+ Tiered search returns results in priority order:
+ 1. **Memories** - Curated, high-signal knowledge
+ 2. **Documentation** - Indexed markdown files
+ 3. **Code** - Function signatures and symbols
+
  ### Hybrid Search

- Semantic search can lose qualifiers - "error handling in mobile app" might return web app results because "error handling" dominates the embedding. Use `require_terms` to ensure specific words appear:
+ For explicit keyword filtering, use `require_terms`:

  ```bash
  # CLI
@@ -277,6 +306,29 @@ search(query="error handling", require_terms=["mobile", "dart"])

  This combines semantic similarity (finds conceptually related content) with keyword filtering (ensures qualifiers aren't ignored).

+ ### Hierarchical Doc Chunking
+
+ Long markdown files are automatically chunked by headers for better search accuracy:
+
+ - Each section becomes a separate searchable chunk
+ - Parent headers are preserved as context in the embedding
+ - Short docs (<500 chars) remain as single chunks
+ - Section path is stored (e.g., "Installation > Configuration > Environment Variables")
+
+ ### Feedback Loop
+
+ Search quality improves over time based on usage patterns:
+
+ ```bash
+ # Record when a result is useful (via MCP)
+ record_feedback(query="auth flow", result_file="src/auth.py", action="used")
+
+ # View usage statistics
+ feedback_stats()
+ ```
+
+ Frequently-used files receive a boost in future search rankings.
+
  ## Code Indexing

  The code indexer extracts meaningful symbols from your codebase:
@@ -379,13 +431,15 @@ Add to your Claude config (`.mcp.json`):

  Available tools:
  - `remember` - Store a memory
- - `search` - Semantic search
+ - `search` - Semantic search (supports tiered mode and auto-extraction)
  - `list_memories` - List with filters
  - `get_memory` - Get by ID
  - `store_doc` - Store document verbatim
  - `forget` - Delete memory
  - `graduate` - Promote branch → app
  - `update_status` - Change memory status
+ - `record_feedback` - Record when search results are used (improves future rankings)
+ - `feedback_stats` - View search result usage patterns

  ## ghp-cli Integration

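The qualifier auto-detection described in the README changes above ships in this release, but its implementation is not among the diffs shown on this page. A minimal sketch of the idea, assuming a plain keyword allowlist (all names here are hypothetical, not ragtime's actual code):

```python
# Hypothetical sketch - ragtime's real extractor is not shown in this
# diff and may differ in names and qualifier list.
KNOWN_QUALIFIERS = {
    "mobile", "web", "desktop", "ios", "android", "flutter", "react",
    "vue", "dart", "python", "typescript", "auth", "api", "database",
    "frontend", "backend",
}

def extract_qualifiers(query: str) -> tuple[str, list[str]]:
    """Split a natural-language query into core terms and detected qualifiers."""
    words = query.lower().split()
    qualifiers = [w for w in words if w in KNOWN_QUALIFIERS]
    core = " ".join(w for w in words if w not in KNOWN_QUALIFIERS)
    return core, qualifiers

# "error handling in mobile app" -> ("error handling in app", ["mobile"])
core, quals = extract_qualifiers("error handling in mobile app")
```

In this shape, `ragtime search "error handling in mobile app"` would behave like `ragtime search "error handling" -r mobile`, and `--raw` would simply skip the extraction step.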

{ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/README.md

@@ -233,9 +233,38 @@ This is intentional - embeddings work better on focused summaries than large cod

  For Claude/MCP usage: The search tool description instructs Claude to read returned file paths for full implementations before making code changes.

+ ### Smart Query Understanding
+
+ Search automatically detects qualifiers in natural language:
+
+ ```bash
+ # These are equivalent - qualifiers are auto-detected
+ ragtime search "error handling in mobile app"
+ ragtime search "error handling" -r mobile
+
+ # Use --raw for literal/exact search
+ ragtime search "mobile error handling" --raw
+ ```
+
+ Auto-detected qualifiers include: mobile, web, desktop, ios, android, flutter, react, vue, dart, python, typescript, auth, api, database, frontend, backend, and more.
+
+ ### Tiered Search
+
+ Use tiered search to prioritize curated knowledge over raw code:
+
+ ```bash
+ # Via MCP
+ search(query="authentication", tiered=True)
+ ```
+
+ Tiered search returns results in priority order:
+ 1. **Memories** - Curated, high-signal knowledge
+ 2. **Documentation** - Indexed markdown files
+ 3. **Code** - Function signatures and symbols
+
  ### Hybrid Search

- Semantic search can lose qualifiers - "error handling in mobile app" might return web app results because "error handling" dominates the embedding. Use `require_terms` to ensure specific words appear:
+ For explicit keyword filtering, use `require_terms`:

  ```bash
  # CLI
@@ -247,6 +276,29 @@ search(query="error handling", require_terms=["mobile", "dart"])

  This combines semantic similarity (finds conceptually related content) with keyword filtering (ensures qualifiers aren't ignored).

+ ### Hierarchical Doc Chunking
+
+ Long markdown files are automatically chunked by headers for better search accuracy:
+
+ - Each section becomes a separate searchable chunk
+ - Parent headers are preserved as context in the embedding
+ - Short docs (<500 chars) remain as single chunks
+ - Section path is stored (e.g., "Installation > Configuration > Environment Variables")
+
+ ### Feedback Loop
+
+ Search quality improves over time based on usage patterns:
+
+ ```bash
+ # Record when a result is useful (via MCP)
+ record_feedback(query="auth flow", result_file="src/auth.py", action="used")
+
+ # View usage statistics
+ feedback_stats()
+ ```
+
+ Frequently-used files receive a boost in future search rankings.
+
  ## Code Indexing

  The code indexer extracts meaningful symbols from your codebase:
@@ -349,13 +401,15 @@ Add to your Claude config (`.mcp.json`):

  Available tools:
  - `remember` - Store a memory
- - `search` - Semantic search
+ - `search` - Semantic search (supports tiered mode and auto-extraction)
  - `list_memories` - List with filters
  - `get_memory` - Get by ID
  - `store_doc` - Store document verbatim
  - `forget` - Delete memory
  - `graduate` - Promote branch → app
  - `update_status` - Change memory status
+ - `record_feedback` - Record when search results are used (improves future rankings)
+ - `feedback_stats` - View search result usage patterns

  ## ghp-cli Integration

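The hierarchical doc chunking documented above is implemented in the new src/indexers/docs.py (+312 lines), whose diff is not reproduced in this section. A rough sketch of header-based chunking with the behaviors the README describes - per-section chunks, a parent-header path, and a short-doc shortcut; function and field names here are assumptions:

```python
# Minimal sketch, assuming the behavior stated in the README; the real
# src/indexers/docs.py is not shown in this diff and may differ.
import re

def chunk_markdown(text: str, min_len: int = 500) -> list[dict]:
    """Split markdown into per-section chunks, keeping the header path."""
    if len(text) < min_len:  # short docs stay whole
        return [{"section": "", "content": text}]
    chunks: list[dict] = []
    path: list[str] = []  # path[i] = header title at depth i+1
    current = {"section": "", "lines": []}
    for line in text.splitlines():
        m = re.match(r"^(#{1,6})\s+(.*)", line)
        if m:
            if current["lines"]:
                chunks.append({"section": current["section"],
                               "content": "\n".join(current["lines"])})
            depth, title = len(m.group(1)), m.group(2).strip()
            path = path[: depth - 1] + [title]
            # Stored path looks like "Installation > Configuration > Environment Variables"
            current = {"section": " > ".join(path), "lines": [line]}
        else:
            current["lines"].append(line)
    if current["lines"]:
        chunks.append({"section": current["section"],
                       "content": "\n".join(current["lines"])})
    return chunks
```

Embedding the section path alongside each chunk's content is what lets a query like "environment variables" land on the right subsection instead of the whole install guide.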

{ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "ragtime-cli"
- version = "0.2.14"
+ version = "0.2.16"
  description = "Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge"
  readme = "README.md"
  license = "MIT"

{ragtime_cli-0.2.14 → ragtime_cli-0.2.16/ragtime_cli.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ragtime-cli
- Version: 0.2.14
+ Version: 0.2.16
  Summary: Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge
  Author-email: Bret Martineau <bretwardjames@gmail.com>
  License-Expression: MIT
@@ -263,9 +263,38 @@ This is intentional - embeddings work better on focused summaries than large cod

  For Claude/MCP usage: The search tool description instructs Claude to read returned file paths for full implementations before making code changes.

+ ### Smart Query Understanding
+
+ Search automatically detects qualifiers in natural language:
+
+ ```bash
+ # These are equivalent - qualifiers are auto-detected
+ ragtime search "error handling in mobile app"
+ ragtime search "error handling" -r mobile
+
+ # Use --raw for literal/exact search
+ ragtime search "mobile error handling" --raw
+ ```
+
+ Auto-detected qualifiers include: mobile, web, desktop, ios, android, flutter, react, vue, dart, python, typescript, auth, api, database, frontend, backend, and more.
+
+ ### Tiered Search
+
+ Use tiered search to prioritize curated knowledge over raw code:
+
+ ```bash
+ # Via MCP
+ search(query="authentication", tiered=True)
+ ```
+
+ Tiered search returns results in priority order:
+ 1. **Memories** - Curated, high-signal knowledge
+ 2. **Documentation** - Indexed markdown files
+ 3. **Code** - Function signatures and symbols
+
  ### Hybrid Search

- Semantic search can lose qualifiers - "error handling in mobile app" might return web app results because "error handling" dominates the embedding. Use `require_terms` to ensure specific words appear:
+ For explicit keyword filtering, use `require_terms`:

  ```bash
  # CLI
@@ -277,6 +306,29 @@ search(query="error handling", require_terms=["mobile", "dart"])

  This combines semantic similarity (finds conceptually related content) with keyword filtering (ensures qualifiers aren't ignored).

+ ### Hierarchical Doc Chunking
+
+ Long markdown files are automatically chunked by headers for better search accuracy:
+
+ - Each section becomes a separate searchable chunk
+ - Parent headers are preserved as context in the embedding
+ - Short docs (<500 chars) remain as single chunks
+ - Section path is stored (e.g., "Installation > Configuration > Environment Variables")
+
+ ### Feedback Loop
+
+ Search quality improves over time based on usage patterns:
+
+ ```bash
+ # Record when a result is useful (via MCP)
+ record_feedback(query="auth flow", result_file="src/auth.py", action="used")
+
+ # View usage statistics
+ feedback_stats()
+ ```
+
+ Frequently-used files receive a boost in future search rankings.
+
  ## Code Indexing

  The code indexer extracts meaningful symbols from your codebase:
@@ -379,13 +431,15 @@ Add to your Claude config (`.mcp.json`):

  Available tools:
  - `remember` - Store a memory
- - `search` - Semantic search
+ - `search` - Semantic search (supports tiered mode and auto-extraction)
  - `list_memories` - List with filters
  - `get_memory` - Get by ID
  - `store_doc` - Store document verbatim
  - `forget` - Delete memory
  - `graduate` - Promote branch → app
  - `update_status` - Change memory status
+ - `record_feedback` - Record when search results are used (improves future rankings)
+ - `feedback_stats` - View search result usage patterns

  ## ghp-cli Integration


{ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/ragtime_cli.egg-info/SOURCES.txt

@@ -11,6 +11,7 @@ src/__init__.py
  src/cli.py
  src/config.py
  src/db.py
+ src/feedback.py
  src/mcp_server.py
  src/memory.py
  src/commands/audit.md

{ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/cli.py

@@ -381,13 +381,13 @@ def index(path: Path, index_type: str, clear: bool):
          item_show_func=lambda f: f.name[:30] if f else "",
      ) as files:
          for file_path in files:
-             entry = index_doc_file(file_path)
-             if entry:
-                 entries.append(entry)
+             # index_doc_file returns list (hierarchical chunks)
+             file_entries = index_doc_file(file_path)
+             entries.extend(file_entries)

      if entries:
          _upsert_entries(db, entries, "docs")
-         click.echo(f" Indexed {len(entries)} documents")
+         click.echo(f" Indexed {len(entries)} document chunks")
      elif not to_delete:
          click.echo(" All docs up to date")
      else:
@@ -2215,8 +2215,12 @@ def update(check: bool):
      import json
      from urllib.request import urlopen
      from urllib.error import URLError
+     from importlib.metadata import version as get_version

-     current = "0.2.9"
+     try:
+         current = get_version("ragtime-cli")
+     except Exception:
+         current = "0.0.0"  # Fallback if not installed as package

      click.echo(f"Current version: {current}")
      click.echo("Checking PyPI for updates...")

{ragtime_cli-0.2.14 → ragtime_cli-0.2.16}/src/db.py

@@ -238,6 +238,118 @@ class RagtimeDB:

          return output

+     def search_tiered(
+         self,
+         query: str,
+         limit: int = 10,
+         namespace: str | None = None,
+         require_terms: list[str] | None = None,
+         auto_extract: bool = True,
+         **filters,
+     ) -> list[dict]:
+         """
+         Tiered search: prioritizes memories > docs > code.
+
+         Searches in priority order, filling up to limit:
+         1. Memories (curated, high-signal knowledge)
+         2. Documentation (indexed markdown)
+         3. Code (broadest, implementation details)
+
+         Args:
+             query: Natural language search query
+             limit: Max total results to return
+             namespace: Filter by namespace
+             require_terms: Terms that MUST appear in results
+             auto_extract: Auto-detect qualifiers from query
+             **filters: Additional metadata filters
+
+         Returns:
+             List of dicts with 'content', 'metadata', 'distance', 'tier'
+         """
+         results = []
+
+         # Tier 1: Memories (not docs or code)
+         memory_results = self._search_tier(
+             query=query,
+             tier_name="memory",
+             exclude_types=["docs", "code"],
+             limit=limit,
+             namespace=namespace,
+             require_terms=require_terms,
+             auto_extract=auto_extract,
+             **filters,
+         )
+         results.extend(memory_results)
+
+         # Tier 2: Documentation
+         if len(results) < limit:
+             doc_results = self._search_tier(
+                 query=query,
+                 tier_name="docs",
+                 type_filter="docs",
+                 limit=limit - len(results),
+                 namespace=namespace,
+                 require_terms=require_terms,
+                 auto_extract=auto_extract,
+                 **filters,
+             )
+             results.extend(doc_results)
+
+         # Tier 3: Code
+         if len(results) < limit:
+             code_results = self._search_tier(
+                 query=query,
+                 tier_name="code",
+                 type_filter="code",
+                 limit=limit - len(results),
+                 namespace=namespace,
+                 require_terms=require_terms,
+                 auto_extract=auto_extract,
+                 **filters,
+             )
+             results.extend(code_results)
+
+         return results
+
+     def _search_tier(
+         self,
+         query: str,
+         tier_name: str,
+         limit: int,
+         type_filter: str | None = None,
+         exclude_types: list[str] | None = None,
+         **kwargs,
+     ) -> list[dict]:
+         """Search a single tier and tag results."""
+         # Build where clause for exclusion if needed
+         if exclude_types:
+             # Search without type filter, then exclude in post-processing
+             results = self.search(
+                 query=query,
+                 limit=limit * 2,  # fetch more since we'll filter
+                 type_filter=None,
+                 **kwargs,
+             )
+             # Filter out excluded types
+             filtered = []
+             for r in results:
+                 if r["metadata"].get("type") not in exclude_types:
+                     r["tier"] = tier_name
+                     filtered.append(r)
+                 if len(filtered) >= limit:
+                     break
+             return filtered
+         else:
+             results = self.search(
+                 query=query,
+                 limit=limit,
+                 type_filter=type_filter,
+                 **kwargs,
+             )
+             for r in results:
+                 r["tier"] = tier_name
+             return results
+
      def delete(self, ids: list[str]) -> None:
          """Delete documents by ID."""
          self.collection.delete(ids=ids)
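A usage sketch for the search_tiered method added above. Only the method signature, the result shape, and the tier names come from this diff; the import path and constructor call are assumptions:

```python
from src.db import RagtimeDB  # import path assumed from the source layout above

db = RagtimeDB()  # hypothetical: construct however your setup requires

results = db.search_tiered(
    query="authentication flow",
    limit=10,
    require_terms=["auth"],  # terms that MUST appear in results
)

for r in results:
    # Each result is tagged with the tier it came from:
    # "memory", "docs", or "code" - memories fill first.
    print(r["tier"], r["metadata"].get("file", ""), r["distance"])
```

Note the fill-up behavior: docs and code are only queried when the earlier tiers return fewer than `limit` results, so curated memories can fully occupy the result list.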

ragtime_cli-0.2.16/src/feedback.py

@@ -0,0 +1,202 @@
+ """
+ Feedback loop for RAG result quality improvement.
+
+ Tracks which search results are actually used/referenced by Claude,
+ enabling re-ranking and quality improvements over time.
+ """
+
+ import json
+ from pathlib import Path
+ from dataclasses import dataclass, field, asdict
+ from datetime import datetime
+ from typing import Optional
+
+
+ @dataclass
+ class SearchFeedback:
+     """Feedback for a single search result."""
+     query: str
+     result_id: str  # ChromaDB document ID
+     result_file: str  # File path for easier debugging
+     action: str  # "used", "referenced", "ignored", "helpful", "not_helpful"
+     timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+     session_id: Optional[str] = None  # Group related searches
+     position: int = 0  # Position in results (1-indexed)
+     distance: float = 0.0  # Original semantic distance
+
+
+ class FeedbackStore:
+     """
+     Simple file-based feedback storage.
+
+     Stores feedback as JSON lines for easy analysis.
+     Can be upgraded to SQLite or ChromaDB later.
+     """
+
+     def __init__(self, path: Path):
+         """
+         Initialize feedback store.
+
+         Args:
+             path: Directory to store feedback data
+         """
+         self.path = path
+         self.feedback_file = path / "feedback.jsonl"
+         self.stats_file = path / "feedback_stats.json"
+         path.mkdir(parents=True, exist_ok=True)
+
+     def record(self, feedback: SearchFeedback) -> None:
+         """Record a single feedback entry."""
+         with open(self.feedback_file, "a") as f:
+             f.write(json.dumps(asdict(feedback)) + "\n")
+
+     def record_usage(
+         self,
+         query: str,
+         result_id: str,
+         result_file: str,
+         position: int = 0,
+         distance: float = 0.0,
+         session_id: Optional[str] = None,
+     ) -> None:
+         """Convenience method to record when a result is used."""
+         self.record(SearchFeedback(
+             query=query,
+             result_id=result_id,
+             result_file=result_file,
+             action="used",
+             position=position,
+             distance=distance,
+             session_id=session_id,
+         ))
+
+     def record_batch(
+         self,
+         query: str,
+         used_ids: list[str],
+         all_results: list[dict],
+         session_id: Optional[str] = None,
+     ) -> None:
+         """
+         Record feedback for a batch of results.
+
+         Marks used_ids as "used" and others as "ignored".
+         """
+         used_set = set(used_ids)
+
+         for i, result in enumerate(all_results):
+             result_id = result.get("id", "")
+             result_file = result.get("metadata", {}).get("file", "")
+             distance = result.get("distance", 0.0)
+
+             action = "used" if result_id in used_set else "ignored"
+
+             self.record(SearchFeedback(
+                 query=query,
+                 result_id=result_id,
+                 result_file=result_file,
+                 action=action,
+                 position=i + 1,
+                 distance=distance,
+                 session_id=session_id,
+             ))
+
+     def get_usage_stats(self) -> dict:
+         """
+         Get aggregated usage statistics.
+
+         Returns:
+             Dict with usage counts, popular files, etc.
+         """
+         if not self.feedback_file.exists():
+             return {"total": 0, "used": 0, "ignored": 0}
+
+         stats = {
+             "total": 0,
+             "used": 0,
+             "ignored": 0,
+             "helpful": 0,
+             "not_helpful": 0,
+             "files_used": {},  # file -> count
+             "avg_position_used": 0.0,
+         }
+
+         positions = []
+
+         with open(self.feedback_file) as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+                 try:
+                     entry = json.loads(line)
+                     stats["total"] += 1
+                     action = entry.get("action", "")
+
+                     if action == "used":
+                         stats["used"] += 1
+                         positions.append(entry.get("position", 0))
+                         file_path = entry.get("result_file", "")
+                         stats["files_used"][file_path] = stats["files_used"].get(file_path, 0) + 1
+                     elif action == "ignored":
+                         stats["ignored"] += 1
+                     elif action == "helpful":
+                         stats["helpful"] += 1
+                     elif action == "not_helpful":
+                         stats["not_helpful"] += 1
+                 except json.JSONDecodeError:
+                     continue
+
+         if positions:
+             stats["avg_position_used"] = sum(positions) / len(positions)
+
+         return stats
+
+     def get_boost_scores(self) -> dict[str, float]:
+         """
+         Calculate boost scores for files based on historical usage.
+
+         Returns:
+             Dict mapping file paths to boost multipliers (1.0 = no boost).
+         """
+         stats = self.get_usage_stats()
+         files_used = stats.get("files_used", {})
+
+         if not files_used:
+             return {}
+
+         # Normalize to 0-1 range, then convert to boost multiplier
+         max_count = max(files_used.values())
+         boosts = {}
+
+         for file_path, count in files_used.items():
+             # Boost range: 1.0 (no boost) to 1.5 (50% boost for most-used)
+             normalized = count / max_count
+             boosts[file_path] = 1.0 + (normalized * 0.5)
+
+         return boosts
+
+     def apply_boosts(self, results: list[dict], boosts: dict[str, float]) -> list[dict]:
+         """
+         Apply historical boost scores to search results.
+
+         Adjusts distances based on historical usage patterns.
+         Lower distance = more relevant, so we divide by boost.
+         """
+         if not boosts:
+             return results
+
+         for result in results:
+             file_path = result.get("metadata", {}).get("file", "")
+             boost = boosts.get(file_path, 1.0)
+             if "distance" in result and result["distance"]:
+                 # Reduce distance for frequently-used files
+                 result["distance"] = result["distance"] / boost
+                 result["boosted"] = boost > 1.0
+
+         # Re-sort by adjusted distance
+         return sorted(results, key=lambda r: r.get("distance", float("inf")))
+
+     def clear(self) -> None:
+         """Clear all feedback data."""
+         if self.feedback_file.exists():
+             self.feedback_file.unlink()
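To close out, a usage sketch for the new FeedbackStore. The method calls come from feedback.py above; the storage directory, import path, and sample data are assumptions:

```python
from pathlib import Path

from src.feedback import FeedbackStore  # import path assumed from the source layout

store = FeedbackStore(Path(".ragtime"))  # hypothetical storage directory

# Record that a search result was actually used for a query
store.record_usage(
    query="auth flow",
    result_id="doc-123",  # hypothetical ChromaDB document ID
    result_file="src/auth.py",
    position=1,
    distance=0.42,
)

# Derive boost multipliers (1.0 to 1.5) from usage history, then re-rank:
# apply_boosts divides each result's distance by its boost, so
# frequently-used files sort earlier.
boosts = store.get_boost_scores()
results = [{"metadata": {"file": "src/auth.py"}, "distance": 0.42}]
reranked = store.apply_boosts(results, boosts)

print(store.get_usage_stats())
```

Because feedback is append-only JSON lines, the boost computation rescans the whole file on each call; the docstring's note about upgrading to SQLite or ChromaDB is the obvious path once the log grows.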