code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. code_review_graph/__init__.py +20 -0
  2. code_review_graph/__main__.py +4 -0
  3. code_review_graph/analysis.py +410 -0
  4. code_review_graph/changes.py +409 -0
  5. code_review_graph/cli.py +1255 -0
  6. code_review_graph/communities.py +874 -0
  7. code_review_graph/constants.py +23 -0
  8. code_review_graph/context_savings.py +317 -0
  9. code_review_graph/custom_languages.py +322 -0
  10. code_review_graph/daemon.py +1009 -0
  11. code_review_graph/daemon_cli.py +320 -0
  12. code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
  13. code_review_graph/embeddings.py +1006 -0
  14. code_review_graph/enrich.py +303 -0
  15. code_review_graph/eval/__init__.py +33 -0
  16. code_review_graph/eval/benchmarks/__init__.py +1 -0
  17. code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
  18. code_review_graph/eval/benchmarks/build_performance.py +60 -0
  19. code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
  20. code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
  21. code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
  22. code_review_graph/eval/benchmarks/search_quality.py +59 -0
  23. code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
  24. code_review_graph/eval/configs/code-review-graph.yaml +50 -0
  25. code_review_graph/eval/configs/express.yaml +45 -0
  26. code_review_graph/eval/configs/fastapi.yaml +48 -0
  27. code_review_graph/eval/configs/flask.yaml +50 -0
  28. code_review_graph/eval/configs/gin.yaml +51 -0
  29. code_review_graph/eval/configs/httpx.yaml +48 -0
  30. code_review_graph/eval/reporter.py +301 -0
  31. code_review_graph/eval/runner.py +211 -0
  32. code_review_graph/eval/scorer.py +85 -0
  33. code_review_graph/eval/token_benchmark.py +182 -0
  34. code_review_graph/exports.py +409 -0
  35. code_review_graph/flows.py +698 -0
  36. code_review_graph/graph.py +1427 -0
  37. code_review_graph/graph_diff.py +122 -0
  38. code_review_graph/hints.py +384 -0
  39. code_review_graph/incremental.py +1245 -0
  40. code_review_graph/jedi_resolver.py +303 -0
  41. code_review_graph/main.py +1079 -0
  42. code_review_graph/memory.py +142 -0
  43. code_review_graph/migrations.py +284 -0
  44. code_review_graph/parser.py +6957 -0
  45. code_review_graph/postprocessing.py +134 -0
  46. code_review_graph/prompts.py +159 -0
  47. code_review_graph/refactor.py +852 -0
  48. code_review_graph/registry.py +319 -0
  49. code_review_graph/rescript_resolver.py +206 -0
  50. code_review_graph/search.py +447 -0
  51. code_review_graph/skills.py +1481 -0
  52. code_review_graph/spring_resolver.py +200 -0
  53. code_review_graph/temporal_resolver.py +199 -0
  54. code_review_graph/token_benchmark.py +125 -0
  55. code_review_graph/tools/__init__.py +156 -0
  56. code_review_graph/tools/_common.py +176 -0
  57. code_review_graph/tools/analysis_tools.py +184 -0
  58. code_review_graph/tools/build.py +541 -0
  59. code_review_graph/tools/community_tools.py +246 -0
  60. code_review_graph/tools/context.py +152 -0
  61. code_review_graph/tools/docs.py +274 -0
  62. code_review_graph/tools/flows_tools.py +176 -0
  63. code_review_graph/tools/query.py +692 -0
  64. code_review_graph/tools/refactor_tools.py +168 -0
  65. code_review_graph/tools/registry_tools.py +125 -0
  66. code_review_graph/tools/review.py +477 -0
  67. code_review_graph/tsconfig_resolver.py +257 -0
  68. code_review_graph/visualization.py +2184 -0
  69. code_review_graph/wiki.py +305 -0
  70. code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
  71. code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
  72. code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
  73. code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
  74. code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,447 @@
1
+ """Hybrid search engine combining FTS5 (BM25) and vector embeddings.
2
+
3
+ Uses Reciprocal Rank Fusion (RRF) to merge results from full-text search
4
+ and semantic similarity, with query-aware kind boosting and context-file
5
+ boosting for relevance tuning.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import re
12
+ import sqlite3
13
+ from typing import Any, Optional
14
+
15
+ from .graph import GraphStore, _sanitize_name
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # FTS5 index management
22
+ # ---------------------------------------------------------------------------
23
+
24
+
25
def rebuild_fts_index(store: GraphStore) -> int:
    """Recreate the ``nodes_fts`` FTS5 table and refill it from ``nodes``.

    The virtual table is dropped if it exists, created again as an
    external-content FTS5 table over ``nodes``, and repopulated with the
    FTS5 ``'rebuild'`` command. The three statements run inside a single
    ``BEGIN IMMEDIATE`` transaction; on any failure the transaction is
    rolled back and the exception is re-raised.

    If the store's connection already has an uncommitted transaction open,
    that transaction is rolled back (and a warning is logged) before the
    rebuild begins.

    Returns:
        The row count reported by ``SELECT count(*) FROM nodes_fts`` once
        the rebuild has been committed.
    """
    # NOTE: this talks to store._conn directly because it owns the FTS5
    # virtual-table DDL, which is tightly coupled to SQLite internals.
    db = store._conn

    # Table definition uses content sync; kept in line with migration v5.
    create_fts_sql = """
        CREATE VIRTUAL TABLE nodes_fts USING fts5(
            name, qualified_name, file_path, signature,
            content='nodes', content_rowid='rowid',
            tokenize='porter unicode61'
        )
    """

    # BEGIN IMMEDIATE cannot be issued while a transaction is already open,
    # so any pending work on this connection is discarded first.
    if db.in_transaction:
        logger.warning("Rolling back uncommitted transaction before BEGIN IMMEDIATE")
        db.rollback()

    # One explicit transaction around DROP + CREATE + rebuild, so a crash
    # part-way through cannot leave the database with no FTS table at all
    # (DROP applied, CREATE/INSERT not). See #259.
    db.execute("BEGIN IMMEDIATE")
    try:
        db.execute("DROP TABLE IF EXISTS nodes_fts")
        db.execute(create_fts_sql)
        # Ask FTS5 to re-read every row from the content table (nodes).
        db.execute("INSERT INTO nodes_fts(nodes_fts) VALUES('rebuild')")
        db.commit()
    except BaseException:
        # BaseException on purpose: KeyboardInterrupt/SystemExit must also
        # undo the partial rebuild before propagating.
        db.rollback()
        raise

    count_row = db.execute("SELECT count(*) FROM nodes_fts").fetchone()
    indexed = count_row[0]
    logger.info("FTS index rebuilt: %d rows indexed", indexed)
    return indexed
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # Query kind boosting heuristics
72
+ # ---------------------------------------------------------------------------
73
+
74
+
75
# Two or more identifier segments joined by dots, e.g. ``Context.Next``.
_DOTTED_IDENT_RE = re.compile(r'\b[A-Za-z_][\w]*(?:\.[A-Za-z_][\w]*)+\b')
# Lowercase snake_case with at least one underscore, e.g. ``get_dependant``.
_SNAKE_IDENT_RE = re.compile(r'\b[a-z][a-z0-9]*(?:_[a-z0-9]+)+\b')
# PascalCase with at least two humps. The leading hump may be an acronym run
# (``APIRoute``, ``HTTPServer``, ``OAuth``); the lookahead demands at least
# two lowercase/digit characters so plural acronyms such as ``APIs`` or
# ``IDs`` in ordinary prose are not mistaken for identifiers. Every word the
# previous single-capital-per-hump pattern accepted is still accepted.
_PASCAL_IDENT_RE = re.compile(
    r'\b(?=\w*[a-z0-9]\w*[a-z0-9])[A-Z]+[a-z0-9]*(?:[A-Z][a-z0-9]+)+\b'
)


def extract_query_identifiers(query: str) -> list[str]:
    """Pull out identifier-shaped tokens from anywhere in a query.

    Catches dotted forms (``Context.Next``), snake_case (``get_dependant``),
    and PascalCase (``MyClass``, ``APIRoute``) even when they are embedded
    in a natural-language sentence. Used to boost search hits whose
    qualified_name contains any of these tokens, so an LLM asking "Who
    advances the gin middleware chain via Context.Next" lands on
    ``Context.Next`` instead of the bare ``Context`` class.

    Args:
        query: Free-form query text.

    Returns:
        Lower-cased identifiers of length >= 3, de-duplicated, ordered by
        pattern (dotted, then snake_case, then PascalCase) and by position
        in the query within each pattern.
    """
    found: list[str] = []
    seen: set[str] = set()
    for pattern in (_DOTTED_IDENT_RE, _SNAKE_IDENT_RE, _PASCAL_IDENT_RE):
        for token in pattern.findall(query):
            lowered = token.lower()
            # Very short tokens would match almost every qualified name.
            if len(lowered) < 3 or lowered in seen:
                continue
            seen.add(lowered)
            found.append(lowered)
    return found
99
+
100
+
101
def detect_query_kind_boost(query: str) -> dict[str, Any]:
    """Derive score multipliers from the surface shape of a query.

    Rules, each applied independently to the whitespace-stripped query:
    - Starts with an uppercase ASCII letter followed by a lowercase one
      (e.g. ``MyClass``): ``Class`` and ``Type`` get 1.5x.
    - Contains an underscore and at least one ASCII letter
      (e.g. ``get_users``, ``MAX_SIZE``): ``Function`` gets 1.5x.
    - Contains a ``.``: the special key ``_qualified`` is set to 2.0.
    - Any identifier-shaped tokens found by ``extract_query_identifiers``
      are stored, as a list, under ``_qualified_identifiers``.

    Returns:
        Dict whose keys are node kind strings (mapped to float multipliers)
        and/or the special keys ``_qualified`` (float) and
        ``_qualified_identifiers`` (list of lower-cased identifiers).
        Empty for a missing or blank query.
    """
    stripped = query.strip() if query else ""
    if not stripped:
        return {}

    boosts: dict[str, Any] = {}

    # Leading capital followed by a lowercase letter -> probably a type name.
    looks_pascal = re.match(r'^[A-Z][a-z]', stripped) and not stripped.isupper()
    if looks_pascal:
        boosts.update({"Class": 1.5, "Type": 1.5})

    # snake_case or SCREAMING_SNAKE_CASE: an underscore plus some letter.
    has_letter = re.search(r'[a-zA-Z]', stripped)
    if '_' in stripped and has_letter:
        boosts["Function"] = 1.5

    # Dotted path: favour qualified-name matches.
    if '.' in stripped:
        boosts["_qualified"] = 2.0

    # Identifier-shaped tokens from anywhere in the query text.
    identifiers = extract_query_identifiers(stripped)
    if identifiers:
        boosts["_qualified_identifiers"] = identifiers

    return boosts
143
+
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Reciprocal Rank Fusion
147
+ # ---------------------------------------------------------------------------
148
+
149
+
150
def rrf_merge(*result_lists: list[tuple[int, float]], k: int = 60) -> list[tuple[int, float]]:
    """Fuse several ranked lists into one with Reciprocal Rank Fusion.

    Only the position of an item inside each list matters; the score stored
    in the input tuples is ignored. An item at 0-based position ``rank``
    contributes ``1 / (k + rank + 1)``, and contributions are summed over
    every list the item appears in.

    Args:
        *result_lists: Ranked lists of ``(id, score)`` tuples, best first.
        k: RRF constant (default 60). Larger values flatten the difference
            between adjacent ranks.

    Returns:
        ``(id, rrf_score)`` tuples sorted by fused score, highest first.
        Items with equal fused scores keep first-seen order.
    """
    fused: dict[int, float] = {}

    for ranking in result_lists:
        for rank, (node_id, _) in enumerate(ranking):
            contribution = 1.0 / (k + rank + 1)
            fused[node_id] = fused.get(node_id, 0.0) + contribution

    ranked = list(fused.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
174
+
175
+
176
+ # ---------------------------------------------------------------------------
177
+ # FTS5 search
178
+ # ---------------------------------------------------------------------------
179
+
180
+
181
def _fts_search(
    conn: sqlite3.Connection,
    query: str,
    limit: int = 50,
) -> list[tuple[int, float]]:
    """Query the ``nodes_fts`` table and return BM25-ranked row ids.

    The query text is passed to FTS5 as one double-quoted string (embedded
    quotes doubled), so FTS5 operators inside it are not interpreted.

    Returns:
        ``(rowid, score)`` tuples in FTS5 rank order. ``score`` is the
        negated FTS5 ``rank`` value, so higher means a better match.
        An empty list if SQLite raises ``OperationalError`` (the failure is
        logged as a warning).
    """
    # Quote the whole query to prevent FTS5 operator injection.
    escaped = query.replace('"', '""')
    match_expr = f'"{escaped}"'
    sql = (
        "SELECT rowid, rank FROM nodes_fts WHERE nodes_fts MATCH ? "
        "ORDER BY rank LIMIT ?"
    )

    try:
        hits = conn.execute(sql, (match_expr, limit)).fetchall()
    except sqlite3.OperationalError as e:
        logger.warning("FTS5 search failed: %s", e)
        return []

    # FTS5 rank is negative BM25 (lower = better); flip the sign so callers
    # can treat every score as "higher is better".
    return [(hit[0], -hit[1]) for hit in hits]
205
+
206
+
207
+ # ---------------------------------------------------------------------------
208
+ # Embedding search (optional)
209
+ # ---------------------------------------------------------------------------
210
+
211
+
212
def _embedding_search(
    store: GraphStore,
    query: str,
    limit: int = 50,
    model: str | None = None,
    provider: str | None = None,
) -> list[tuple[int, float]]:
    """Look up nodes by vector similarity through ``EmbeddingStore``.

    Args:
        store: Graph store; supplies ``db_path`` for the embedding store and
            resolves qualified names back to nodes via ``get_node``.
        query: Text handed to ``EmbeddingStore.search``.
        limit: Maximum number of hits requested from the embedding store.
        model: Passed through to the ``EmbeddingStore`` constructor.
        provider: Passed through to the ``EmbeddingStore`` constructor.

    Returns:
        ``(node_id, similarity_score)`` tuples in the order the embedding
        store returned them, skipping hits whose qualified name no longer
        resolves to a node. An empty list when the embeddings module cannot
        be imported, the store reports itself unavailable or empty, or any
        exception occurs (logged as a warning).
    """
    # The embeddings module is optional; its absence is not an error.
    try:
        from .embeddings import EmbeddingStore
    except ImportError:
        return []

    try:
        emb_store = EmbeddingStore(store.db_path, provider=provider, model=model)
        try:
            usable = emb_store.available and emb_store.count() != 0
            if not usable:
                return []

            # The embedding store reports qualified names; translate each
            # one back into a node id and drop names that do not resolve.
            id_scores: list[tuple[int, float]] = []
            for qualified_name, similarity in emb_store.search(query, limit=limit):
                node = store.get_node(qualified_name)
                if not node:
                    continue
                id_scores.append((node.id, similarity))
            return id_scores
        finally:
            # Always release the embedding store, including on early return.
            emb_store.close()
    except Exception as e:
        logger.warning("Embedding search failed: %s", e)
        return []
248
+
249
+
250
+ # ---------------------------------------------------------------------------
251
+ # Keyword LIKE fallback
252
+ # ---------------------------------------------------------------------------
253
+
254
+
255
+ def _keyword_search(
256
+ conn: sqlite3.Connection,
257
+ query: str,
258
+ limit: int = 50,
259
+ ) -> list[tuple[int, float]]:
260
+ """Fall back to simple LIKE keyword matching.
261
+
262
+ Each word in the query must match independently (AND logic).
263
+ Returns ``(node_id, score)`` tuples with a basic relevance score.
264
+ """
265
+ words = query.lower().split()
266
+ if not words:
267
+ return []
268
+
269
+ conditions: list[str] = []
270
+ params: list[str | int] = []
271
+ for word in words:
272
+ conditions.append(
273
+ "(LOWER(name) LIKE ? OR LOWER(qualified_name) LIKE ?)"
274
+ )
275
+ params.extend([f"%{word}%", f"%{word}%"])
276
+
277
+ where = " AND ".join(conditions)
278
+ params.append(limit)
279
+ sql = f"SELECT id, name, qualified_name FROM nodes WHERE {where} LIMIT ?" # nosec B608
280
+
281
+ try:
282
+ rows = conn.execute(sql, params).fetchall()
283
+ except sqlite3.OperationalError:
284
+ return []
285
+
286
+ # Assign a simple relevance score: exact name match > prefix > contains
287
+ q_lower = query.lower()
288
+ results: list[tuple[int, float]] = []
289
+ for row in rows:
290
+ name_lower = row["name"].lower()
291
+ if name_lower == q_lower:
292
+ score = 3.0
293
+ elif name_lower.startswith(q_lower):
294
+ score = 2.0
295
+ else:
296
+ score = 1.0
297
+ results.append((row["id"], score))
298
+
299
+ results.sort(key=lambda x: x[1], reverse=True)
300
+ return results
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # Main hybrid search
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
def hybrid_search(
    store: GraphStore,
    query: str,
    kind: Optional[str] = None,
    limit: int = 20,
    context_files: Optional[list[str]] = None,
    model: Optional[str] = None,
    provider: Optional[str] = None,
) -> list[dict[str, Any]]:
    """Hybrid search combining FTS5 BM25 and vector embeddings via RRF.

    Runs the FTS5 search and the embedding search, fuses whichever of the
    two returned hits with Reciprocal Rank Fusion, and falls back to
    keyword LIKE matching only when both came back empty. Candidates are
    then re-scored with query-shape boosts and a context-file boost.

    Args:
        store: The graph store to search.
        query: Search query string.
        kind: Optional node kind filter (e.g. ``"Function"``, ``"Class"``).
            Applied after candidates are gathered, so fewer than ``limit``
            results may be returned even if more matching nodes exist.
        limit: Maximum results to return (default 20). A value <= 0 yields
            an empty list.
        context_files: Optional list of file paths. Nodes in these files
            receive a 1.5x score boost.
        model: Optional embedding model name, forwarded to the embedding
            search.
        provider: Optional embedding provider name, forwarded to the
            embedding search.

    Returns:
        List of dicts with node metadata and a ``score`` field (rounded to
        six decimals), best match first.
    """
    if not query or not query.strip():
        return []
    # A non-positive limit can never produce results; bail out before
    # SQLite sees a negative LIMIT, which it treats as "no limit".
    if limit <= 0:
        return []

    # NOTE: hybrid_search uses store._conn for FTS5 and keyword queries
    # because those operate on the FTS virtual table or need raw Row
    # access for batch-fetch performance. This is documented coupling.
    conn = store._conn
    fetch_limit = limit * 3  # Fetch extra to allow for filtering and boosting

    # ------ Phase 1: Gather ranked lists ------
    fts_results: list[tuple[int, float]] = []
    try:
        fts_results = _fts_search(conn, query, limit=fetch_limit)
    except Exception as e:
        logger.warning("FTS5 unavailable, will use fallback: %s", e)

    emb_results = _embedding_search(
        store, query, limit=fetch_limit, model=model, provider=provider,
    )

    # ------ Phase 2: Merge via RRF or fallback ------
    ranked_lists = [hits for hits in (fts_results, emb_results) if hits]
    if ranked_lists:
        merged = rrf_merge(*ranked_lists)
    else:
        # Fallback: keyword LIKE matching
        merged = _keyword_search(conn, query, limit=fetch_limit)
        if not merged:
            return []

    # ------ Phase 3: Batch-fetch candidate rows ------
    candidate_ids = [node_id for node_id, _ in merged]
    node_rows: dict[int, Any] = {}
    batch_size = 450  # bounds the number of bound parameters per statement
    for start in range(0, len(candidate_ids), batch_size):
        batch = candidate_ids[start:start + batch_size]
        placeholders = ",".join("?" for _ in batch)
        rows = conn.execute(
            f"SELECT * FROM nodes WHERE id IN ({placeholders})",  # nosec B608
            batch,
        ).fetchall()
        for row in rows:
            node_rows[row["id"]] = row

    # ------ Phase 4: Apply kind filter and boosting ------
    kind_boosts = detect_query_kind_boost(query)
    qualified_boost = kind_boosts.get("_qualified")
    idents = kind_boosts.get("_qualified_identifiers")
    context_set = set(context_files) if context_files else set()
    # The boosts above were detected on the stripped query, so match
    # against the stripped text too; otherwise stray surrounding
    # whitespace silently disables the qualified-name boost.
    needle = query.strip().lower()

    boosted: list[tuple[int, float]] = []
    for node_id, score in merged:
        row = node_rows.get(node_id)
        if row is None:
            # Candidate id no longer present in the nodes table.
            continue

        node_kind = row["kind"]
        if kind and node_kind != kind:
            continue

        qn_lower = row["qualified_name"].lower()

        boost = 1.0
        if node_kind in kind_boosts:
            boost *= kind_boosts[node_kind]
        if qualified_boost is not None and needle in qn_lower:
            boost *= qualified_boost
        if idents and any(ident in qn_lower for ident in idents):
            boost *= 2.0
        if row["file_path"] in context_set:
            boost *= 1.5

        boosted.append((node_id, score * boost))

    boosted.sort(key=lambda item: item[1], reverse=True)

    # ------ Phase 5: Build result dicts from the already-fetched rows ------
    results: list[dict[str, Any]] = []
    for node_id, final_score in boosted[:limit]:
        row = node_rows[node_id]
        results.append({
            "name": _sanitize_name(row["name"]),
            "qualified_name": _sanitize_name(row["qualified_name"]),
            "kind": row["kind"],
            "file_path": row["file_path"],
            "line_start": row["line_start"],
            "line_end": row["line_end"],
            "language": row["language"] or "",
            "params": row["params"],
            "return_type": row["return_type"],
            # Read defensively: the row may not carry a signature column.
            "signature": row["signature"] if "signature" in row.keys() else None,
            "score": round(final_score, 6),
        })

    return results