ragtime-cli 0.2.13.tar.gz → 0.2.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {ragtime_cli-0.2.13/ragtime_cli.egg-info → ragtime_cli-0.2.15}/PKG-INFO +1 -1
  2. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/pyproject.toml +1 -1
  3. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15/ragtime_cli.egg-info}/PKG-INFO +1 -1
  4. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/ragtime_cli.egg-info/SOURCES.txt +1 -0
  5. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/cli.py +14 -9
  6. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/db.py +194 -4
  7. ragtime_cli-0.2.15/src/feedback.py +202 -0
  8. ragtime_cli-0.2.15/src/indexers/docs.py +312 -0
  9. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/mcp_server.py +133 -14
  10. ragtime_cli-0.2.13/src/indexers/docs.py +0 -134
  11. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/LICENSE +0 -0
  12. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/README.md +0 -0
  13. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/ragtime_cli.egg-info/dependency_links.txt +0 -0
  14. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/ragtime_cli.egg-info/entry_points.txt +0 -0
  15. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/ragtime_cli.egg-info/requires.txt +0 -0
  16. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/ragtime_cli.egg-info/top_level.txt +0 -0
  17. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/setup.cfg +0 -0
  18. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/__init__.py +0 -0
  19. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/audit.md +0 -0
  20. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/create-pr.md +0 -0
  21. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/generate-docs.md +0 -0
  22. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/handoff.md +0 -0
  23. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/import-docs.md +0 -0
  24. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/pr-graduate.md +0 -0
  25. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/recall.md +0 -0
  26. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/remember.md +0 -0
  27. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/save.md +0 -0
  28. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/commands/start.md +0 -0
  29. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/config.py +0 -0
  30. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/indexers/__init__.py +0 -0
  31. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/indexers/code.py +0 -0
  32. {ragtime_cli-0.2.13 → ragtime_cli-0.2.15}/src/memory.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragtime-cli
3
- Version: 0.2.13
3
+ Version: 0.2.15
4
4
  Summary: Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge
5
5
  Author-email: Bret Martineau <bretwardjames@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ragtime-cli"
3
- version = "0.2.13"
3
+ version = "0.2.15"
4
4
  description = "Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge"
5
5
  readme = "README.md"
6
6
  license = "MIT"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragtime-cli
3
- Version: 0.2.13
3
+ Version: 0.2.15
4
4
  Summary: Local-first memory and RAG system for Claude Code - semantic search over code, docs, and team knowledge
5
5
  Author-email: Bret Martineau <bretwardjames@gmail.com>
6
6
  License-Expression: MIT
@@ -11,6 +11,7 @@ src/__init__.py
11
11
  src/cli.py
12
12
  src/config.py
13
13
  src/db.py
14
+ src/feedback.py
14
15
  src/mcp_server.py
15
16
  src/memory.py
16
17
  src/commands/audit.md
@@ -381,13 +381,13 @@ def index(path: Path, index_type: str, clear: bool):
381
381
  item_show_func=lambda f: f.name[:30] if f else "",
382
382
  ) as files:
383
383
  for file_path in files:
384
- entry = index_doc_file(file_path)
385
- if entry:
386
- entries.append(entry)
384
+ # index_doc_file returns list (hierarchical chunks)
385
+ file_entries = index_doc_file(file_path)
386
+ entries.extend(file_entries)
387
387
 
388
388
  if entries:
389
389
  _upsert_entries(db, entries, "docs")
390
- click.echo(f" Indexed {len(entries)} documents")
390
+ click.echo(f" Indexed {len(entries)} document chunks")
391
391
  elif not to_delete:
392
392
  click.echo(" All docs up to date")
393
393
  else:
@@ -470,17 +470,21 @@ def index(path: Path, index_type: str, clear: bool):
470
470
  @click.option("--type", "type_filter", type=click.Choice(["all", "docs", "code"]), default="all")
471
471
  @click.option("--namespace", "-n", help="Filter by namespace")
472
472
  @click.option("--require", "-r", "require_terms", multiple=True,
473
- help="Terms that MUST appear in results (repeatable)")
473
+ help="Additional terms that MUST appear (usually auto-detected)")
474
+ @click.option("--raw", is_flag=True, help="Disable auto-detection of qualifiers")
474
475
  @click.option("--include-archive", is_flag=True, help="Also search archived branches")
475
476
  @click.option("--limit", "-l", default=5, help="Max results")
476
477
  @click.option("--verbose", "-v", is_flag=True, help="Show full content")
477
478
  def search(query: str, path: Path, type_filter: str, namespace: str,
478
- require_terms: tuple, include_archive: bool, limit: int, verbose: bool):
479
+ require_terms: tuple, raw: bool, include_archive: bool, limit: int, verbose: bool):
479
480
  """
480
- Hybrid search: semantic similarity + keyword filtering.
481
+ Smart search: auto-detects qualifiers like 'mobile', 'auth', 'dart'.
481
482
 
482
- Use --require/-r to ensure specific terms appear in results.
483
- Example: ragtime search "error handling" -r mobile -r dart
483
+ \b
484
+ Examples:
485
+ ragtime search "error handling in mobile" # auto-requires 'mobile'
486
+ ragtime search "auth flow" # auto-requires 'auth'
487
+ ragtime search "useAsyncState" --raw # literal search, no extraction
484
488
  """
485
489
  path = Path(path).resolve()
486
490
  db = get_db(path)
@@ -493,6 +497,7 @@ def search(query: str, path: Path, type_filter: str, namespace: str,
493
497
  type_filter=type_arg,
494
498
  namespace=namespace,
495
499
  require_terms=list(require_terms) if require_terms else None,
500
+ auto_extract=not raw,
496
501
  )
497
502
 
498
503
  if not results:
@@ -4,12 +4,74 @@ ChromaDB wrapper for ragtime.
4
4
  Handles storage and retrieval of indexed documents and code.
5
5
  """
6
6
 
7
+ import re
7
8
  from pathlib import Path
8
9
  from typing import Any
9
10
  import chromadb
10
11
  from chromadb.config import Settings
11
12
 
12
13
 
14
+ def extract_query_hints(query: str, known_components: list[str] | None = None) -> tuple[str, list[str]]:
15
+ """
16
+ Extract component/scope hints from a query for hybrid search.
17
+
18
+ Detects patterns like "X in mobile", "mobile X", "X for auth" and extracts
19
+ the qualifier to use as require_terms. This prevents qualifiers from being
20
+ diluted in semantic search.
21
+
22
+ Args:
23
+ query: The natural language search query
24
+ known_components: Optional list of known component names to detect
25
+
26
+ Returns:
27
+ (cleaned_query, extracted_terms) - query with hints removed, terms to require
28
+ """
29
+ # Default known components/scopes (common patterns)
30
+ default_components = [
31
+ # Platforms
32
+ "mobile", "web", "desktop", "ios", "android", "flutter", "react", "vue",
33
+ # Languages
34
+ "dart", "python", "typescript", "javascript", "ts", "js", "py",
35
+ # Common components
36
+ "auth", "authentication", "api", "database", "db", "ui", "frontend", "backend",
37
+ "server", "client", "admin", "user", "payment", "billing", "notification",
38
+ "email", "cache", "queue", "worker", "scheduler", "logging", "metrics",
39
+ ]
40
+
41
+ components = set(c.lower() for c in (known_components or default_components))
42
+ extracted = []
43
+ cleaned = query
44
+
45
+ # Pattern 1: "X in/for/on {component}" - extract component
46
+ patterns = [
47
+ r'\b(?:in|for|on|from|using|with)\s+(?:the\s+)?(\w+)\s*(?:app|code|module|service|codebase)?(?:\s|$)',
48
+ r'\b(\w+)\s+(?:app|code|module|service|codebase)\b',
49
+ ]
50
+
51
+ for pattern in patterns:
52
+ for match in re.finditer(pattern, query, re.IGNORECASE):
53
+ word = match.group(1).lower()
54
+ if word in components:
55
+ extracted.append(word)
56
+ # Remove the matched phrase from query
57
+ cleaned = cleaned[:match.start()] + " " + cleaned[match.end():]
58
+
59
+ # Pattern 2: Check if any known component appears as standalone word
60
+ words = re.findall(r'\b\w+\b', query.lower())
61
+ for word in words:
62
+ if word in components and word not in extracted:
63
+ # Only extract if it looks like a qualifier (not the main subject)
64
+ # Heuristic: if query has other meaningful words, it's likely a qualifier
65
+ other_words = [w for w in words if w != word and len(w) > 3]
66
+ if len(other_words) >= 2:
67
+ extracted.append(word)
68
+
69
+ # Clean up extra whitespace
70
+ cleaned = re.sub(r'\s+', ' ', cleaned).strip()
71
+
72
+ return cleaned, list(set(extracted))
73
+
74
+
13
75
  class RagtimeDB:
14
76
  """Vector database for ragtime indexes."""
15
77
 
@@ -85,6 +147,7 @@ class RagtimeDB:
85
147
  type_filter: str | None = None,
86
148
  namespace: str | None = None,
87
149
  require_terms: list[str] | None = None,
150
+ auto_extract: bool = True,
88
151
  **filters,
89
152
  ) -> list[dict]:
90
153
  """
@@ -98,11 +161,26 @@ class RagtimeDB:
98
161
  require_terms: List of terms that MUST appear in results (case-insensitive).
99
162
  Use for scoped queries like "error handling in mobile" with
100
163
  require_terms=["mobile"] to ensure "mobile" isn't ignored.
164
+ auto_extract: If True (default), automatically detect component qualifiers
165
+ in the query and add them to require_terms. Set to False
166
+ for raw/literal search.
101
167
  **filters: Additional metadata filters (None values are ignored)
102
168
 
103
169
  Returns:
104
170
  List of dicts with 'content', 'metadata', 'distance'
105
171
  """
172
+ # Auto-extract component hints from query if enabled
173
+ search_query = query
174
+ all_require_terms = list(require_terms) if require_terms else []
175
+
176
+ if auto_extract:
177
+ cleaned_query, extracted = extract_query_hints(query)
178
+ if extracted:
179
+ # Use cleaned query for embedding (removes noise)
180
+ search_query = cleaned_query
181
+ # Add extracted terms to require_terms
182
+ all_require_terms.extend(extracted)
183
+ all_require_terms = list(set(all_require_terms)) # dedupe
106
184
  # Build list of filter conditions, excluding None values
107
185
  conditions = []
108
186
 
@@ -126,10 +204,10 @@ class RagtimeDB:
126
204
  where = {"$and": conditions}
127
205
 
128
206
  # When using require_terms, fetch more results since we'll filter some out
129
- fetch_limit = limit * 5 if require_terms else limit
207
+ fetch_limit = limit * 5 if all_require_terms else limit
130
208
 
131
209
  results = self.collection.query(
132
- query_texts=[query],
210
+ query_texts=[search_query],
133
211
  n_results=fetch_limit,
134
212
  where=where,
135
213
  )
@@ -139,13 +217,13 @@ class RagtimeDB:
139
217
  if results["documents"] and results["documents"][0]:
140
218
  for i, doc in enumerate(results["documents"][0]):
141
219
  # Hybrid filtering: ensure required terms appear
142
- if require_terms:
220
+ if all_require_terms:
143
221
  doc_lower = doc.lower()
144
222
  # Also check file path in metadata for code/file matches
145
223
  file_path = (results["metadatas"][0][i].get("file", "") or "").lower()
146
224
  combined_text = f"{doc_lower} {file_path}"
147
225
 
148
- if not all(term.lower() in combined_text for term in require_terms):
226
+ if not all(term.lower() in combined_text for term in all_require_terms):
149
227
  continue
150
228
 
151
229
  output.append({
@@ -160,6 +238,118 @@ class RagtimeDB:
160
238
 
161
239
  return output
162
240
 
241
+ def search_tiered(
242
+ self,
243
+ query: str,
244
+ limit: int = 10,
245
+ namespace: str | None = None,
246
+ require_terms: list[str] | None = None,
247
+ auto_extract: bool = True,
248
+ **filters,
249
+ ) -> list[dict]:
250
+ """
251
+ Tiered search: prioritizes memories > docs > code.
252
+
253
+ Searches in priority order, filling up to limit:
254
+ 1. Memories (curated, high-signal knowledge)
255
+ 2. Documentation (indexed markdown)
256
+ 3. Code (broadest, implementation details)
257
+
258
+ Args:
259
+ query: Natural language search query
260
+ limit: Max total results to return
261
+ namespace: Filter by namespace
262
+ require_terms: Terms that MUST appear in results
263
+ auto_extract: Auto-detect qualifiers from query
264
+ **filters: Additional metadata filters
265
+
266
+ Returns:
267
+ List of dicts with 'content', 'metadata', 'distance', 'tier'
268
+ """
269
+ results = []
270
+
271
+ # Tier 1: Memories (not docs or code)
272
+ memory_results = self._search_tier(
273
+ query=query,
274
+ tier_name="memory",
275
+ exclude_types=["docs", "code"],
276
+ limit=limit,
277
+ namespace=namespace,
278
+ require_terms=require_terms,
279
+ auto_extract=auto_extract,
280
+ **filters,
281
+ )
282
+ results.extend(memory_results)
283
+
284
+ # Tier 2: Documentation
285
+ if len(results) < limit:
286
+ doc_results = self._search_tier(
287
+ query=query,
288
+ tier_name="docs",
289
+ type_filter="docs",
290
+ limit=limit - len(results),
291
+ namespace=namespace,
292
+ require_terms=require_terms,
293
+ auto_extract=auto_extract,
294
+ **filters,
295
+ )
296
+ results.extend(doc_results)
297
+
298
+ # Tier 3: Code
299
+ if len(results) < limit:
300
+ code_results = self._search_tier(
301
+ query=query,
302
+ tier_name="code",
303
+ type_filter="code",
304
+ limit=limit - len(results),
305
+ namespace=namespace,
306
+ require_terms=require_terms,
307
+ auto_extract=auto_extract,
308
+ **filters,
309
+ )
310
+ results.extend(code_results)
311
+
312
+ return results
313
+
314
+ def _search_tier(
315
+ self,
316
+ query: str,
317
+ tier_name: str,
318
+ limit: int,
319
+ type_filter: str | None = None,
320
+ exclude_types: list[str] | None = None,
321
+ **kwargs,
322
+ ) -> list[dict]:
323
+ """Search a single tier and tag results."""
324
+ # Build where clause for exclusion if needed
325
+ if exclude_types:
326
+ # Search without type filter, then exclude in post-processing
327
+ results = self.search(
328
+ query=query,
329
+ limit=limit * 2, # fetch more since we'll filter
330
+ type_filter=None,
331
+ **kwargs,
332
+ )
333
+ # Filter out excluded types
334
+ filtered = []
335
+ for r in results:
336
+ if r["metadata"].get("type") not in exclude_types:
337
+ r["tier"] = tier_name
338
+ filtered.append(r)
339
+ if len(filtered) >= limit:
340
+ break
341
+ return filtered
342
+ else:
343
+ results = self.search(
344
+ query=query,
345
+ limit=limit,
346
+ type_filter=type_filter,
347
+ **kwargs,
348
+ )
349
+ for r in results:
350
+ r["tier"] = tier_name
351
+ return results
352
+
163
353
  def delete(self, ids: list[str]) -> None:
164
354
  """Delete documents by ID."""
165
355
  self.collection.delete(ids=ids)
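Before the new feedback module, a rough sketch of how the tiered search above could be driven. The RagtimeDB constructor is not part of this diff, so the setup line is an assumption; the result keys follow the docstring:

    from pathlib import Path
    from src.db import RagtimeDB  # module path assumed

    db = RagtimeDB(Path(".ragtime/chroma"))  # constructor arguments assumed, not shown in this diff
    for hit in db.search_tiered("auth flow", limit=5):
        # 'tier' is "memory", "docs", or "code"; memories are searched first
        print(hit["tier"], hit["metadata"].get("file"), hit["distance"])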
@@ -0,0 +1,202 @@
1
+ """
2
+ Feedback loop for RAG result quality improvement.
3
+
4
+ Tracks which search results are actually used/referenced by Claude,
5
+ enabling re-ranking and quality improvements over time.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from dataclasses import dataclass, field, asdict
11
+ from datetime import datetime
12
+ from typing import Optional
13
+
14
+
15
+ @dataclass
16
+ class SearchFeedback:
17
+ """Feedback for a single search result."""
18
+ query: str
19
+ result_id: str # ChromaDB document ID
20
+ result_file: str # File path for easier debugging
21
+ action: str # "used", "referenced", "ignored", "helpful", "not_helpful"
22
+ timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
23
+ session_id: Optional[str] = None # Group related searches
24
+ position: int = 0 # Position in results (1-indexed)
25
+ distance: float = 0.0 # Original semantic distance
26
+
27
+
28
+ class FeedbackStore:
29
+ """
30
+ Simple file-based feedback storage.
31
+
32
+ Stores feedback as JSON lines for easy analysis.
33
+ Can be upgraded to SQLite or ChromaDB later.
34
+ """
35
+
36
+ def __init__(self, path: Path):
37
+ """
38
+ Initialize feedback store.
39
+
40
+ Args:
41
+ path: Directory to store feedback data
42
+ """
43
+ self.path = path
44
+ self.feedback_file = path / "feedback.jsonl"
45
+ self.stats_file = path / "feedback_stats.json"
46
+ path.mkdir(parents=True, exist_ok=True)
47
+
48
+ def record(self, feedback: SearchFeedback) -> None:
49
+ """Record a single feedback entry."""
50
+ with open(self.feedback_file, "a") as f:
51
+ f.write(json.dumps(asdict(feedback)) + "\n")
52
+
53
+ def record_usage(
54
+ self,
55
+ query: str,
56
+ result_id: str,
57
+ result_file: str,
58
+ position: int = 0,
59
+ distance: float = 0.0,
60
+ session_id: Optional[str] = None,
61
+ ) -> None:
62
+ """Convenience method to record when a result is used."""
63
+ self.record(SearchFeedback(
64
+ query=query,
65
+ result_id=result_id,
66
+ result_file=result_file,
67
+ action="used",
68
+ position=position,
69
+ distance=distance,
70
+ session_id=session_id,
71
+ ))
72
+
73
+ def record_batch(
74
+ self,
75
+ query: str,
76
+ used_ids: list[str],
77
+ all_results: list[dict],
78
+ session_id: Optional[str] = None,
79
+ ) -> None:
80
+ """
81
+ Record feedback for a batch of results.
82
+
83
+ Marks used_ids as "used" and others as "ignored".
84
+ """
85
+ used_set = set(used_ids)
86
+
87
+ for i, result in enumerate(all_results):
88
+ result_id = result.get("id", "")
89
+ result_file = result.get("metadata", {}).get("file", "")
90
+ distance = result.get("distance", 0.0)
91
+
92
+ action = "used" if result_id in used_set else "ignored"
93
+
94
+ self.record(SearchFeedback(
95
+ query=query,
96
+ result_id=result_id,
97
+ result_file=result_file,
98
+ action=action,
99
+ position=i + 1,
100
+ distance=distance,
101
+ session_id=session_id,
102
+ ))
103
+
104
+ def get_usage_stats(self) -> dict:
105
+ """
106
+ Get aggregated usage statistics.
107
+
108
+ Returns:
109
+ Dict with usage counts, popular files, etc.
110
+ """
111
+ if not self.feedback_file.exists():
112
+ return {"total": 0, "used": 0, "ignored": 0}
113
+
114
+ stats = {
115
+ "total": 0,
116
+ "used": 0,
117
+ "ignored": 0,
118
+ "helpful": 0,
119
+ "not_helpful": 0,
120
+ "files_used": {}, # file -> count
121
+ "avg_position_used": 0.0,
122
+ }
123
+
124
+ positions = []
125
+
126
+ with open(self.feedback_file) as f:
127
+ for line in f:
128
+ if not line.strip():
129
+ continue
130
+ try:
131
+ entry = json.loads(line)
132
+ stats["total"] += 1
133
+ action = entry.get("action", "")
134
+
135
+ if action == "used":
136
+ stats["used"] += 1
137
+ positions.append(entry.get("position", 0))
138
+ file_path = entry.get("result_file", "")
139
+ stats["files_used"][file_path] = stats["files_used"].get(file_path, 0) + 1
140
+ elif action == "ignored":
141
+ stats["ignored"] += 1
142
+ elif action == "helpful":
143
+ stats["helpful"] += 1
144
+ elif action == "not_helpful":
145
+ stats["not_helpful"] += 1
146
+ except json.JSONDecodeError:
147
+ continue
148
+
149
+ if positions:
150
+ stats["avg_position_used"] = sum(positions) / len(positions)
151
+
152
+ return stats
153
+
154
+ def get_boost_scores(self) -> dict[str, float]:
155
+ """
156
+ Calculate boost scores for files based on historical usage.
157
+
158
+ Returns:
159
+ Dict mapping file paths to boost multipliers (1.0 = no boost).
160
+ """
161
+ stats = self.get_usage_stats()
162
+ files_used = stats.get("files_used", {})
163
+
164
+ if not files_used:
165
+ return {}
166
+
167
+ # Normalize to 0-1 range, then convert to boost multiplier
168
+ max_count = max(files_used.values())
169
+ boosts = {}
170
+
171
+ for file_path, count in files_used.items():
172
+ # Boost range: 1.0 (no boost) to 1.5 (50% boost for most-used)
173
+ normalized = count / max_count
174
+ boosts[file_path] = 1.0 + (normalized * 0.5)
175
+
176
+ return boosts
177
+
178
+ def apply_boosts(self, results: list[dict], boosts: dict[str, float]) -> list[dict]:
179
+ """
180
+ Apply historical boost scores to search results.
181
+
182
+ Adjusts distances based on historical usage patterns.
183
+ Lower distance = more relevant, so we divide by boost.
184
+ """
185
+ if not boosts:
186
+ return results
187
+
188
+ for result in results:
189
+ file_path = result.get("metadata", {}).get("file", "")
190
+ boost = boosts.get(file_path, 1.0)
191
+ if "distance" in result and result["distance"]:
192
+ # Reduce distance for frequently-used files
193
+ result["distance"] = result["distance"] / boost
194
+ result["boosted"] = boost > 1.0
195
+
196
+ # Re-sort by adjusted distance
197
+ return sorted(results, key=lambda r: r.get("distance", float("inf")))
198
+
199
+ def clear(self) -> None:
200
+ """Clear all feedback data."""
201
+ if self.feedback_file.exists():
202
+ self.feedback_file.unlink()
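A self-contained sketch of the feedback loop this new module enables, using the classes defined above (the store path, file names, and distances are made up for illustration):

    from pathlib import Path
    from src.feedback import FeedbackStore  # module path assumed

    store = FeedbackStore(Path(".ragtime/feedback"))
    store.record_usage(query="auth flow", result_id="doc-1",
                       result_file="docs/auth.md", position=1, distance=0.42)

    results = [
        {"metadata": {"file": "docs/auth.md"}, "distance": 0.42},
        {"metadata": {"file": "docs/billing.md"}, "distance": 0.40},
    ]
    boosts = store.get_boost_scores()        # {"docs/auth.md": 1.5} after a single "used" entry
    reranked = store.apply_boosts(results, boosts)
    # auth.md now sorts first: 0.42 / 1.5 = 0.28 beats billing.md's unboosted 0.40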
@@ -0,0 +1,312 @@
1
+ """
2
+ Docs indexer - parses markdown files with YAML frontmatter.
3
+
4
+ Designed for .claude/memory/ style files but works with any markdown.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ from pathlib import Path
10
+ from dataclasses import dataclass
11
+ import yaml
12
+
13
+
14
+ @dataclass
15
+ class DocEntry:
16
+ """A parsed document ready for indexing."""
17
+ content: str
18
+ file_path: str
19
+ namespace: str | None = None
20
+ category: str | None = None
21
+ component: str | None = None
22
+ title: str | None = None
23
+ mtime: float | None = None # File modification time for incremental indexing
24
+ # Hierarchical chunking fields
25
+ section_path: str | None = None # e.g., "Installation > Configuration > Environment Variables"
26
+ section_level: int = 0 # Header depth (0=whole doc, 1=h1, 2=h2, etc.)
27
+ chunk_index: int = 0 # Position within file (for stable IDs)
28
+
29
+ def to_metadata(self) -> dict:
30
+ """Convert to ChromaDB metadata dict."""
31
+ return {
32
+ "type": "docs",
33
+ "file": self.file_path,
34
+ "namespace": self.namespace or "default",
35
+ "category": self.category or "",
36
+ "component": self.component or "",
37
+ "title": self.title or Path(self.file_path).stem,
38
+ "mtime": self.mtime or 0.0,
39
+ "section_path": self.section_path or "",
40
+ "section_level": self.section_level,
41
+ }
42
+
43
+
44
+ def parse_frontmatter(content: str) -> tuple[dict, str]:
45
+ """
46
+ Parse YAML frontmatter from markdown content.
47
+
48
+ Returns (metadata_dict, body_content).
49
+ If no frontmatter, returns ({}, full_content).
50
+ """
51
+ pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
52
+ match = re.match(pattern, content, re.DOTALL)
53
+
54
+ if not match:
55
+ return {}, content
56
+
57
+ try:
58
+ metadata = yaml.safe_load(match.group(1)) or {}
59
+ body = match.group(2)
60
+ return metadata, body
61
+ except yaml.YAMLError:
62
+ return {}, content
63
+
64
+
65
+ @dataclass
66
+ class Section:
67
+ """A markdown section for hierarchical chunking."""
68
+ title: str
69
+ level: int # 1-6 for h1-h6
70
+ content: str
71
+ line_start: int
72
+ parent_path: list[str] # Parent headers for context
73
+
74
+
75
+ def chunk_by_headers(
76
+ content: str,
77
+ min_chunk_size: int = 100,
78
+ max_chunk_size: int = 2000,
79
+ ) -> list[Section]:
80
+ """
81
+ Split markdown into sections by headers, preserving hierarchy.
82
+
83
+ Args:
84
+ content: Markdown body (without frontmatter)
85
+ min_chunk_size: Minimum chars to make a standalone section
86
+ max_chunk_size: Maximum chars before splitting further
87
+
88
+ Returns:
89
+ List of Section objects with hierarchical context
90
+ """
91
+ lines = content.split('\n')
92
+ sections: list[Section] = []
93
+ header_stack: list[tuple[int, str]] = [] # (level, title) for building paths
94
+
95
+ current_section_lines: list[str] = []
96
+ current_section_start = 0
97
+ current_title = ""
98
+ current_level = 0
99
+
100
+ def flush_section():
101
+ """Save accumulated lines as a section."""
102
+ nonlocal current_section_lines, current_section_start, current_title, current_level
103
+
104
+ text = '\n'.join(current_section_lines).strip()
105
+ if text:
106
+ # Build parent path from stack (excluding current)
107
+ parent_path = [h[1] for h in header_stack[:-1]] if header_stack else []
108
+
109
+ sections.append(Section(
110
+ title=current_title or "Introduction",
111
+ level=current_level,
112
+ content=text,
113
+ line_start=current_section_start,
114
+ parent_path=parent_path,
115
+ ))
116
+ current_section_lines = []
117
+
118
+ for i, line in enumerate(lines):
119
+ # Detect markdown headers
120
+ header_match = re.match(r'^(#{1,6})\s+(.+)$', line)
121
+
122
+ if header_match:
123
+ # Save previous section
124
+ flush_section()
125
+
126
+ level = len(header_match.group(1))
127
+ title = header_match.group(2).strip()
128
+
129
+ # Update header stack - pop headers at same or lower level
130
+ while header_stack and header_stack[-1][0] >= level:
131
+ header_stack.pop()
132
+ header_stack.append((level, title))
133
+
134
+ current_title = title
135
+ current_level = level
136
+ current_section_start = i
137
+ current_section_lines = [line] # Include header in content
138
+ else:
139
+ current_section_lines.append(line)
140
+
141
+ # Don't forget the last section
142
+ flush_section()
143
+
144
+ # Post-process: merge tiny sections into parents, split huge ones
145
+ processed: list[Section] = []
146
+ for section in sections:
147
+ if len(section.content) < min_chunk_size and processed:
148
+ # Merge into previous section
149
+ processed[-1].content += '\n\n' + section.content
150
+ elif len(section.content) > max_chunk_size:
151
+ # Split by paragraphs
152
+ paragraphs = re.split(r'\n\n+', section.content)
153
+ current_chunk = ""
154
+ chunk_num = 0
155
+
156
+ for para in paragraphs:
157
+ if len(current_chunk) + len(para) > max_chunk_size and current_chunk:
158
+ processed.append(Section(
159
+ title=f"{section.title} (part {chunk_num + 1})",
160
+ level=section.level,
161
+ content=current_chunk.strip(),
162
+ line_start=section.line_start,
163
+ parent_path=section.parent_path,
164
+ ))
165
+ current_chunk = para
166
+ chunk_num += 1
167
+ else:
168
+ current_chunk += '\n\n' + para if current_chunk else para
169
+
170
+ if current_chunk.strip():
171
+ title = f"{section.title} (part {chunk_num + 1})" if chunk_num > 0 else section.title
172
+ processed.append(Section(
173
+ title=title,
174
+ level=section.level,
175
+ content=current_chunk.strip(),
176
+ line_start=section.line_start,
177
+ parent_path=section.parent_path,
178
+ ))
179
+ else:
180
+ processed.append(section)
181
+
182
+ return processed
183
+
184
+
185
+ def index_file(file_path: Path, hierarchical: bool = True) -> list[DocEntry]:
186
+ """
187
+ Parse a single markdown file into DocEntry objects.
188
+
189
+ Args:
190
+ file_path: Path to the markdown file
191
+ hierarchical: If True, chunk by headers for better semantic search.
192
+ If False, return whole file as single entry.
193
+
194
+ Returns:
195
+ List of DocEntry objects (one per section if hierarchical, else one for whole file).
196
+ Empty list if file can't be parsed.
197
+ """
198
+ try:
199
+ content = file_path.read_text(encoding='utf-8')
200
+ mtime = os.path.getmtime(file_path)
201
+ except (IOError, UnicodeDecodeError, OSError):
202
+ return []
203
+
204
+ metadata, body = parse_frontmatter(content)
205
+
206
+ # Skip empty documents
207
+ if not body.strip():
208
+ return []
209
+
210
+ # Base metadata from frontmatter
211
+ base_namespace = metadata.get("namespace")
212
+ base_category = metadata.get("category")
213
+ base_component = metadata.get("component")
214
+ base_title = metadata.get("title") or file_path.stem
215
+
216
+ # Short docs: return as single entry
217
+ if not hierarchical or len(body) < 500:
218
+ return [DocEntry(
219
+ content=body.strip(),
220
+ file_path=str(file_path),
221
+ namespace=base_namespace,
222
+ category=base_category,
223
+ component=base_component,
224
+ title=base_title,
225
+ mtime=mtime,
226
+ section_path="",
227
+ section_level=0,
228
+ chunk_index=0,
229
+ )]
230
+
231
+ # Hierarchical chunking for longer docs
232
+ sections = chunk_by_headers(body)
233
+ entries = []
234
+
235
+ for i, section in enumerate(sections):
236
+ # Build full section path: "Parent > Child > Current"
237
+ path_parts = section.parent_path + [section.title]
238
+ section_path = " > ".join(path_parts)
239
+
240
+ # Prepend context for better embeddings
241
+ context_prefix = f"# {base_title}\n"
242
+ if section.parent_path:
243
+ context_prefix += f"Section: {' > '.join(section.parent_path)}\n\n"
244
+
245
+ entries.append(DocEntry(
246
+ content=context_prefix + section.content,
247
+ file_path=str(file_path),
248
+ namespace=base_namespace,
249
+ category=base_category,
250
+ component=base_component,
251
+ title=section.title,
252
+ mtime=mtime,
253
+ section_path=section_path,
254
+ section_level=section.level,
255
+ chunk_index=i,
256
+ ))
257
+
258
+ return entries
259
+
260
+
261
+ def discover_docs(
262
+ root: Path,
263
+ patterns: list[str] | None = None,
264
+ exclude: list[str] | None = None,
265
+ ) -> list[Path]:
266
+ """
267
+ Find all markdown files to index.
268
+
269
+ Args:
270
+ root: Directory to search
271
+ patterns: Glob patterns to include (default: ["**/*.md"])
272
+ exclude: Patterns to exclude (default: ["**/node_modules/**", "**/.git/**"])
273
+ """
274
+ patterns = patterns or ["**/*.md"]
275
+ exclude = exclude or ["**/node_modules/**", "**/.git/**", "**/.ragtime/**"]
276
+
277
+ files = []
278
+ for pattern in patterns:
279
+ for path in root.glob(pattern):
280
+ if path.is_file():
281
+ # Check exclusions
282
+ skip = False
283
+ for ex in exclude:
284
+ if path.match(ex):
285
+ skip = True
286
+ break
287
+ if not skip:
288
+ files.append(path)
289
+
290
+ return files
291
+
292
+
293
+ def index_directory(root: Path, hierarchical: bool = True, **kwargs) -> list[DocEntry]:
294
+ """
295
+ Index all markdown files in a directory.
296
+
297
+ Args:
298
+ root: Directory to search
299
+ hierarchical: If True, chunk long docs by headers
300
+ **kwargs: Passed to discover_docs (patterns, exclude)
301
+
302
+ Returns:
303
+ List of DocEntry objects ready for vector DB.
304
+ """
305
+ files = discover_docs(root, **kwargs)
306
+ entries = []
307
+
308
+ for file_path in files:
309
+ file_entries = index_file(file_path, hierarchical=hierarchical)
310
+ entries.extend(file_entries)
311
+
312
+ return entries
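To make the hierarchical chunking concrete, a small sketch built on chunk_by_headers as defined above (min_chunk_size is lowered so the toy sections are not merged into each other):

    from src.indexers.docs import chunk_by_headers  # module path assumed

    md = "# Guide\n\nShort intro.\n\n## Install\n\npip install ragtime-cli\n"
    for s in chunk_by_headers(md, min_chunk_size=10):
        print(s.level, s.parent_path, s.title)
    # 1 [] Guide
    # 2 ['Guide'] Install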
@@ -13,6 +13,7 @@ from typing import Any
13
13
 
14
14
  from .db import RagtimeDB
15
15
  from .memory import Memory, MemoryStore
16
+ from .feedback import FeedbackStore, SearchFeedback
16
17
 
17
18
 
18
19
  class RagtimeMCPServer:
@@ -28,6 +29,7 @@ class RagtimeMCPServer:
28
29
  self.project_path = project_path or Path.cwd()
29
30
  self._db = None
30
31
  self._store = None
32
+ self._feedback = None
31
33
 
32
34
  @property
33
35
  def db(self) -> RagtimeDB:
@@ -44,6 +46,14 @@ class RagtimeMCPServer:
44
46
  self._store = MemoryStore(self.project_path, self.db)
45
47
  return self._store
46
48
 
49
+ @property
50
+ def feedback(self) -> FeedbackStore:
51
+ """Lazy-load the feedback store."""
52
+ if self._feedback is None:
53
+ feedback_path = self.project_path / ".ragtime" / "feedback"
54
+ self._feedback = FeedbackStore(feedback_path)
55
+ return self._feedback
56
+
47
57
  def get_author(self) -> str:
48
58
  """Get the current developer's username."""
49
59
  try:
@@ -132,13 +142,18 @@ class RagtimeMCPServer:
132
142
  },
133
143
  {
134
144
  "name": "search",
135
- "description": "Hybrid search over indexed code and docs (semantic + keyword). Returns function signatures, class definitions, and doc summaries with file paths and line numbers. IMPORTANT: Results are summaries only - use the Read tool on returned file paths to see full implementations before making code changes or decisions.",
145
+ "description": "Smart hybrid search over indexed content. Auto-detects qualifiers like 'mobile', 'auth', 'dart' and ensures they appear in results. Use tiered=true for priority ordering (memories > docs > code). Returns summaries with file paths - use Read tool for full implementations.",
136
146
  "inputSchema": {
137
147
  "type": "object",
138
148
  "properties": {
139
149
  "query": {
140
150
  "type": "string",
141
- "description": "Natural language search query"
151
+ "description": "Natural language search query. Qualifiers like 'in mobile', 'for auth', 'dart' are auto-detected."
152
+ },
153
+ "tiered": {
154
+ "type": "boolean",
155
+ "default": False,
156
+ "description": "If true, search in priority order: memories (curated) > docs > code. Good for conceptual queries."
142
157
  },
143
158
  "namespace": {
144
159
  "type": "string",
@@ -146,7 +161,7 @@ class RagtimeMCPServer:
146
161
  },
147
162
  "type": {
148
163
  "type": "string",
149
- "description": "Filter by type (docs, code, architecture, etc.)"
164
+ "description": "Filter by type (docs, code, architecture, etc.). Ignored if tiered=true."
150
165
  },
151
166
  "component": {
152
167
  "type": "string",
@@ -155,7 +170,12 @@ class RagtimeMCPServer:
155
170
  "require_terms": {
156
171
  "type": "array",
157
172
  "items": {"type": "string"},
158
- "description": "Terms that MUST appear in results (case-insensitive). Use for scoped queries like 'error handling in mobile' with require_terms=['mobile'] to ensure the qualifier isn't lost in semantic search."
173
+ "description": "Additional terms that MUST appear in results. Usually not needed since qualifiers are auto-detected."
174
+ },
175
+ "auto_extract": {
176
+ "type": "boolean",
177
+ "default": True,
178
+ "description": "Auto-detect component qualifiers from query. Set to false for literal search."
159
179
  },
160
180
  "limit": {
161
181
  "type": "integer",
@@ -287,6 +307,42 @@ class RagtimeMCPServer:
287
307
  },
288
308
  "required": ["memory_id", "status"]
289
309
  }
310
+ },
311
+ {
312
+ "name": "record_feedback",
313
+ "description": "Record feedback when search results are used or referenced. Call this after using a search result to improve future rankings.",
314
+ "inputSchema": {
315
+ "type": "object",
316
+ "properties": {
317
+ "query": {
318
+ "type": "string",
319
+ "description": "The original search query"
320
+ },
321
+ "result_file": {
322
+ "type": "string",
323
+ "description": "File path of the result that was used"
324
+ },
325
+ "action": {
326
+ "type": "string",
327
+ "enum": ["used", "referenced", "helpful", "not_helpful"],
328
+ "default": "used",
329
+ "description": "What happened with this result"
330
+ },
331
+ "position": {
332
+ "type": "integer",
333
+ "description": "Position in search results (1-indexed)"
334
+ }
335
+ },
336
+ "required": ["query", "result_file"]
337
+ }
338
+ },
339
+ {
340
+ "name": "feedback_stats",
341
+ "description": "Get statistics about search result usage patterns",
342
+ "inputSchema": {
343
+ "type": "object",
344
+ "properties": {}
345
+ }
290
346
  }
291
347
  ]
292
348
 
@@ -308,6 +364,10 @@ class RagtimeMCPServer:
308
364
  return self._graduate(arguments)
309
365
  elif name == "update_status":
310
366
  return self._update_status(arguments)
367
+ elif name == "record_feedback":
368
+ return self._record_feedback(arguments)
369
+ elif name == "feedback_stats":
370
+ return self._feedback_stats(arguments)
311
371
  else:
312
372
  raise ValueError(f"Unknown tool: {name}")
313
373
 
@@ -338,23 +398,43 @@ class RagtimeMCPServer:
338
398
  }
339
399
 
340
400
  def _search(self, args: dict) -> dict:
341
- """Search indexed content with hybrid semantic + keyword matching."""
342
- results = self.db.search(
343
- query=args["query"],
344
- limit=args.get("limit", 10),
345
- namespace=args.get("namespace"),
346
- type_filter=args.get("type"),
347
- component=args.get("component"),
348
- require_terms=args.get("require_terms"),
349
- )
401
+ """Search indexed content with smart query understanding."""
402
+ if args.get("tiered", False):
403
+ # Tiered search: memories > docs > code
404
+ results = self.db.search_tiered(
405
+ query=args["query"],
406
+ limit=args.get("limit", 10),
407
+ namespace=args.get("namespace"),
408
+ require_terms=args.get("require_terms"),
409
+ auto_extract=args.get("auto_extract", True),
410
+ component=args.get("component"),
411
+ )
412
+ else:
413
+ results = self.db.search(
414
+ query=args["query"],
415
+ limit=args.get("limit", 10),
416
+ namespace=args.get("namespace"),
417
+ type_filter=args.get("type"),
418
+ component=args.get("component"),
419
+ require_terms=args.get("require_terms"),
420
+ auto_extract=args.get("auto_extract", True),
421
+ )
422
+
423
+ # Apply feedback-based boosts
424
+ boosts = self.feedback.get_boost_scores()
425
+ if boosts:
426
+ results = self.feedback.apply_boosts(results, boosts)
350
427
 
351
428
  return {
352
429
  "count": len(results),
430
+ "query": args["query"],
353
431
  "results": [
354
432
  {
355
433
  "content": r["content"],
356
434
  "metadata": r["metadata"],
357
435
  "score": 1 - r["distance"] if r["distance"] else None,
436
+ "boosted": r.get("boosted", False),
437
+ "tier": r.get("tier"), # For tiered search
358
438
  }
359
439
  for r in results
360
440
  ]
@@ -479,6 +559,45 @@ class RagtimeMCPServer:
479
559
  "status": args["status"],
480
560
  }
481
561
 
562
+ def _record_feedback(self, args: dict) -> dict:
563
+ """Record feedback for a search result."""
564
+ feedback = SearchFeedback(
565
+ query=args["query"],
566
+ result_id="", # We match by file path
567
+ result_file=args["result_file"],
568
+ action=args.get("action", "used"),
569
+ position=args.get("position", 0),
570
+ )
571
+
572
+ self.feedback.record(feedback)
573
+
574
+ return {
575
+ "success": True,
576
+ "query": args["query"],
577
+ "result_file": args["result_file"],
578
+ "action": feedback.action,
579
+ }
580
+
581
+ def _feedback_stats(self, args: dict) -> dict:
582
+ """Get feedback statistics."""
583
+ stats = self.feedback.get_usage_stats()
584
+ boosts = self.feedback.get_boost_scores()
585
+
586
+ # Get top boosted files
587
+ top_files = sorted(boosts.items(), key=lambda x: x[1], reverse=True)[:10]
588
+
589
+ return {
590
+ "total_feedback": stats["total"],
591
+ "results_used": stats["used"],
592
+ "results_ignored": stats["ignored"],
593
+ "helpful_count": stats["helpful"],
594
+ "not_helpful_count": stats["not_helpful"],
595
+ "avg_position_used": round(stats["avg_position_used"], 2),
596
+ "top_boosted_files": [
597
+ {"file": f, "boost": round(b, 2)} for f, b in top_files
598
+ ],
599
+ }
600
+
482
601
  def handle_message(self, message: dict) -> dict:
483
602
  """Handle an incoming JSON-RPC message."""
484
603
  method = message.get("method")
@@ -493,7 +612,7 @@ class RagtimeMCPServer:
493
612
  "protocolVersion": "2024-11-05",
494
613
  "serverInfo": {
495
614
  "name": "ragtime",
496
- "version": "0.2.13",
615
+ "version": "0.2.15",
497
616
  },
498
617
  "capabilities": {
499
618
  "tools": {},
@@ -1,134 +0,0 @@
1
- """
2
- Docs indexer - parses markdown files with YAML frontmatter.
3
-
4
- Designed for .claude/memory/ style files but works with any markdown.
5
- """
6
-
7
- import os
8
- import re
9
- from pathlib import Path
10
- from dataclasses import dataclass
11
- import yaml
12
-
13
-
14
- @dataclass
15
- class DocEntry:
16
- """A parsed document ready for indexing."""
17
- content: str
18
- file_path: str
19
- namespace: str | None = None
20
- category: str | None = None
21
- component: str | None = None
22
- title: str | None = None
23
- mtime: float | None = None # File modification time for incremental indexing
24
-
25
- def to_metadata(self) -> dict:
26
- """Convert to ChromaDB metadata dict."""
27
- return {
28
- "type": "docs",
29
- "file": self.file_path,
30
- "namespace": self.namespace or "default",
31
- "category": self.category or "",
32
- "component": self.component or "",
33
- "title": self.title or Path(self.file_path).stem,
34
- "mtime": self.mtime or 0.0,
35
- }
36
-
37
-
38
- def parse_frontmatter(content: str) -> tuple[dict, str]:
39
- """
40
- Parse YAML frontmatter from markdown content.
41
-
42
- Returns (metadata_dict, body_content).
43
- If no frontmatter, returns ({}, full_content).
44
- """
45
- pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
46
- match = re.match(pattern, content, re.DOTALL)
47
-
48
- if not match:
49
- return {}, content
50
-
51
- try:
52
- metadata = yaml.safe_load(match.group(1)) or {}
53
- body = match.group(2)
54
- return metadata, body
55
- except yaml.YAMLError:
56
- return {}, content
57
-
58
-
59
- def index_file(file_path: Path) -> DocEntry | None:
60
- """
61
- Parse a single markdown file into a DocEntry.
62
-
63
- Returns None if file can't be parsed.
64
- """
65
- try:
66
- content = file_path.read_text(encoding='utf-8')
67
- mtime = os.path.getmtime(file_path)
68
- except (IOError, UnicodeDecodeError, OSError):
69
- return None
70
-
71
- metadata, body = parse_frontmatter(content)
72
-
73
- # Skip empty documents
74
- if not body.strip():
75
- return None
76
-
77
- return DocEntry(
78
- content=body.strip(),
79
- file_path=str(file_path),
80
- namespace=metadata.get("namespace"),
81
- category=metadata.get("category"),
82
- component=metadata.get("component"),
83
- title=metadata.get("title"),
84
- mtime=mtime,
85
- )
86
-
87
-
88
- def discover_docs(
89
- root: Path,
90
- patterns: list[str] | None = None,
91
- exclude: list[str] | None = None,
92
- ) -> list[Path]:
93
- """
94
- Find all markdown files to index.
95
-
96
- Args:
97
- root: Directory to search
98
- patterns: Glob patterns to include (default: ["**/*.md"])
99
- exclude: Patterns to exclude (default: ["**/node_modules/**", "**/.git/**"])
100
- """
101
- patterns = patterns or ["**/*.md"]
102
- exclude = exclude or ["**/node_modules/**", "**/.git/**", "**/.ragtime/**"]
103
-
104
- files = []
105
- for pattern in patterns:
106
- for path in root.glob(pattern):
107
- if path.is_file():
108
- # Check exclusions
109
- skip = False
110
- for ex in exclude:
111
- if path.match(ex):
112
- skip = True
113
- break
114
- if not skip:
115
- files.append(path)
116
-
117
- return files
118
-
119
-
120
- def index_directory(root: Path, **kwargs) -> list[DocEntry]:
121
- """
122
- Index all markdown files in a directory.
123
-
124
- Returns list of DocEntry objects ready for vector DB.
125
- """
126
- files = discover_docs(root, **kwargs)
127
- entries = []
128
-
129
- for file_path in files:
130
- entry = index_file(file_path)
131
- if entry:
132
- entries.append(entry)
133
-
134
- return entries