codedocent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codedocent/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """codedocent — code visualization for non-programmers."""
codedocent/__main__.py ADDED
@@ -0,0 +1,4 @@
1
"""Allow running codedocent as `python3 -m codedocent`."""
from codedocent.cli import main

# Guard the entry point so that merely importing codedocent.__main__
# (e.g. by tooling or introspection) does not launch the CLI.
if __name__ == "__main__":
    main()
codedocent/analyzer.py ADDED
@@ -0,0 +1,620 @@
1
+ """AI-powered analysis: summaries, pseudocode, quality scoring, and caching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ import os
8
+ import re
9
+ import sys
10
+ import threading
11
+ import time
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+
14
+ from codedocent.parser import CodeNode
15
+
16
+ try:
17
+ import ollama
18
+ except ImportError:
19
+ ollama = None # type: ignore[assignment]
20
+
21
+ CACHE_FILENAME = ".codedocent_cache.json"
22
+ MAX_SOURCE_LINES = 200
23
+ MIN_LINES_FOR_AI = 3
24
+
25
+ # Quality scoring thresholds: (yellow_threshold, red_threshold)
26
+ # yellow = "complex", red = "warning"
27
+ LINE_THRESHOLDS: dict[str, tuple[int, int]] = {
28
+ "function": (50, 100),
29
+ "method": (50, 100),
30
+ "file": (500, 1000),
31
+ "class": (300, 600),
32
+ }
33
+ PARAM_THRESHOLD = 5
34
+
35
+
36
+ def _count_nodes(node: CodeNode) -> int:
37
+ """Recursive count of all nodes in tree."""
38
+ return 1 + sum(_count_nodes(c) for c in node.children)
39
+
40
+
41
def _build_prompt(node: CodeNode, model: str = "") -> str:
    """Assemble the explain-this-code prompt for *node*.

    Long sources are truncated to MAX_SOURCE_LINES to bound prompt size.
    For qwen3-family models a /no_think directive is appended.
    """
    language = node.language or "unknown"
    code = node.source
    code_lines = code.splitlines()
    if len(code_lines) > MAX_SOURCE_LINES:
        code = "\n".join(code_lines[:MAX_SOURCE_LINES])

    header = (
        "You are a code explainer for non-programmers. "
        f"Given the following {language} code, provide:\n\n"
    )
    instructions = (
        "1. SUMMARY: A plain English explanation (1-3 sentences) that a "
        "non-programmer can understand. Explain WHAT it does and WHY, "
        "not HOW. Avoid jargon.\n\n"
        "2. PSEUDOCODE: A simplified pseudocode version using plain English "
        "function/variable names. Keep it short.\n\n"
    )
    format_spec = (
        "Respond in exactly this format:\n"
        "SUMMARY: <your summary>\n"
        "PSEUDOCODE:\n"
        "<your pseudocode>\n\n"
    )
    code_section = f"Here is the code:\n```{language}\n{code}\n```"

    prompt = header + instructions + format_spec + code_section

    # qwen3 models accept a directive that suppresses chain-of-thought.
    if "qwen3" in model.lower():
        prompt += "\n\n/no_think"

    return prompt
73
+
74
+
75
+ def _strip_think_tags(text: str) -> str:
76
+ """Remove <think>...</think> blocks from model output.
77
+
78
+ Handles variants: <think>, <|think|>, and unclosed tags.
79
+ """
80
+ # Remove well-formed pairs (including <|think|> variants)
81
+ text = re.sub(r"<\|?think\|?>.*?<\|?/think\|?>", "", text, flags=re.DOTALL)
82
+ # Remove unclosed tags (tag to end of string)
83
+ text = re.sub(r"<\|?think\|?>.*", "", text, flags=re.DOTALL)
84
+ return text.strip()
85
+
86
+
87
+ def _parse_ai_response(text: str) -> tuple[str, str]:
88
+ """Parse SUMMARY and PSEUDOCODE from AI response text."""
89
+ summary = ""
90
+ pseudocode = ""
91
+
92
+ summary_match = re.search(
93
+ r"SUMMARY:\s*(.*?)(?=\nPSEUDOCODE:|$)", text, re.DOTALL
94
+ )
95
+ pseudocode_match = re.search(r"PSEUDOCODE:\s*(.*)", text, re.DOTALL)
96
+
97
+ if summary_match:
98
+ summary = summary_match.group(1).strip()
99
+ if pseudocode_match:
100
+ pseudocode = pseudocode_match.group(1).strip()
101
+
102
+ # Fallback: first line as summary if parsing failed
103
+ if not summary:
104
+ lines = text.strip().splitlines()
105
+ if lines:
106
+ summary = lines[0].strip()
107
+
108
+ return summary, pseudocode
109
+
110
+
111
def _summarize_with_ai(
    node: CodeNode, model: str
) -> tuple[str, str]:
    """Ask ollama for a (summary, pseudocode) pair describing *node*.

    Returns a fallback summary text when the model reply is empty or
    unusably short after think-tag stripping.
    """
    reply = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": _build_prompt(node, model)}],
    )
    cleaned = _strip_think_tags(reply.message.content or "")  # pylint: disable=no-member
    # Garbage guard: empty or near-empty output after stripping.
    if not cleaned or len(cleaned) < 10:
        return ("Could not generate summary", "")
    summary, pseudocode = _parse_ai_response(cleaned)
    # A summary this short is not useful — replace it.
    if not summary or len(summary) < 5:
        summary = "Could not generate summary"
    return summary, pseudocode
129
+
130
+
131
+ def _count_parameters(node: CodeNode) -> int:
132
+ """Count parameters of a function/method using tree-sitter."""
133
+ if not node.source or not node.language:
134
+ return 0
135
+
136
+ import tree_sitter_language_pack as tslp # pylint: disable=import-outside-toplevel # noqa: E501
137
+
138
+ try:
139
+ parser = tslp.get_parser(node.language) # type: ignore[arg-type]
140
+ except (KeyError, ValueError):
141
+ return 0
142
+
143
+ tree = parser.parse(node.source.encode())
144
+ root = tree.root_node
145
+
146
+ # Find the parameters / formal_parameters node
147
+ param_node = None
148
+
149
+ def _find_params(n):
150
+ nonlocal param_node
151
+ if param_node is not None:
152
+ return
153
+ if n.type in ("parameters", "formal_parameters"):
154
+ param_node = n
155
+ return
156
+ for child in n.children:
157
+ _find_params(child)
158
+
159
+ _find_params(root)
160
+ if param_node is None:
161
+ return 0
162
+
163
+ count = 0
164
+ for child in param_node.children:
165
+ # Skip punctuation like ( ) ,
166
+ if child.type in ("(", ")", ","):
167
+ continue
168
+ # For Python, skip self/cls
169
+ if node.language == "python":
170
+ text = child.text.decode() if child.text else ""
171
+ if text in ("self", "cls"):
172
+ continue
173
+ count += 1
174
+
175
+ return count
176
+
177
+
178
+ def _worst_quality(a: str, b: str) -> str:
179
+ """Return the worse of two quality labels."""
180
+ order = {"clean": 0, "complex": 1, "warning": 2}
181
+ return a if order.get(a, 0) >= order.get(b, 0) else b
182
+
183
+
184
+ def _score_quality(
185
+ node: CodeNode,
186
+ ) -> tuple[str | None, list[str] | None]:
187
+ """Score code quality using radon and heuristics.
188
+
189
+ Returns (quality, warnings) where quality is 'clean', 'complex',
190
+ or 'warning', and warnings is a list of warning strings.
191
+ For directories, returns (None, None).
192
+ """
193
+ if node.node_type == "directory":
194
+ return None, None
195
+
196
+ warnings: list[str] = []
197
+ quality = "clean"
198
+
199
+ # Radon complexity for Python
200
+ if node.language == "python" and node.source:
201
+ try:
202
+ from radon.complexity import cc_visit, cc_rank # type: ignore[import-untyped] # pylint: disable=import-outside-toplevel # noqa: E501
203
+
204
+ blocks = cc_visit(node.source)
205
+ if blocks:
206
+ worst = max(b.complexity for b in blocks)
207
+ rank = cc_rank(worst)
208
+ if rank in ("A", "B"):
209
+ pass # clean
210
+ elif rank == "C":
211
+ quality = _worst_quality(quality, "complex")
212
+ warnings.append(
213
+ f"Moderate complexity (grade {rank},"
214
+ f" score {worst})"
215
+ )
216
+ else:
217
+ quality = _worst_quality(quality, "warning")
218
+ warnings.append(
219
+ f"High complexity (grade {rank},"
220
+ f" score {worst})"
221
+ )
222
+ except (ImportError, AttributeError): # nosec B110
223
+ pass
224
+
225
+ # Line-count check (two-tier: yellow/red)
226
+ thresholds = LINE_THRESHOLDS.get(node.node_type)
227
+ if thresholds and node.line_count:
228
+ yellow, red = thresholds
229
+ if node.line_count > red:
230
+ quality = _worst_quality(quality, "warning")
231
+ warnings.append(
232
+ f"This {node.node_type} is"
233
+ f" {node.line_count} lines long"
234
+ )
235
+ elif node.line_count > yellow:
236
+ quality = _worst_quality(quality, "complex")
237
+ warnings.append(f"Long {node.node_type}: {node.line_count} lines")
238
+
239
+ # Heuristic: many parameters
240
+ if node.node_type in ("function", "method"):
241
+ param_count = _count_parameters(node)
242
+ if param_count > PARAM_THRESHOLD:
243
+ quality = _worst_quality(quality, "complex")
244
+ warnings.append("Many parameters: consider grouping")
245
+
246
+ return quality, warnings if warnings else None
247
+
248
+
249
+ def _summarize_directory(node: CodeNode) -> None:
250
+ """Synthesize a directory summary from children. No AI needed."""
251
+ if node.node_type != "directory":
252
+ return
253
+
254
+ file_children = [c for c in node.children if c.node_type == "file"]
255
+ dir_children = [c for c in node.children if c.node_type == "directory"]
256
+
257
+ parts: list[str] = []
258
+ if file_children:
259
+ names = ", ".join(c.name for c in file_children)
260
+ parts.append(f"{len(file_children)} files: {names}")
261
+ if dir_children:
262
+ names = ", ".join(c.name for c in dir_children)
263
+ parts.append(f"{len(dir_children)} directories: {names}")
264
+
265
+ node.summary = (
266
+ f"Contains {'; '.join(parts)}" if parts else "Empty directory"
267
+ )
268
+
269
+ # Quality = worst child quality with descriptive rollup
270
+ quality_order = {"warning": 2, "complex": 1, "clean": 0}
271
+ worst = "clean"
272
+ rollup_warnings: list[str] = []
273
+ complex_count = 0
274
+ warning_count = 0
275
+ for child in node.children:
276
+ child_rank = quality_order.get(
277
+ child.quality or "clean", 0
278
+ )
279
+ worst_rank = quality_order.get(worst, 0)
280
+ if child.quality and child_rank > worst_rank:
281
+ worst = child.quality
282
+ if child.quality == "complex":
283
+ complex_count += 1
284
+ if child.quality == "warning":
285
+ warning_count += 1
286
+
287
+ if warning_count:
288
+ label = "child" if warning_count == 1 else "children"
289
+ rollup_warnings.append(f"Contains {warning_count} high-risk {label}")
290
+ if complex_count:
291
+ label = "child" if complex_count == 1 else "children"
292
+ rollup_warnings.append(f"{complex_count} complex {label} inside")
293
+
294
+ node.quality = worst
295
+ node.warnings = rollup_warnings if rollup_warnings else None
296
+
297
+
298
+ def _rollup_quality(node: CodeNode) -> None:
299
+ """Roll up child quality into a file or class node."""
300
+ if not node.children:
301
+ return
302
+ quality_order = {"warning": 2, "complex": 1, "clean": 0}
303
+ own_quality = node.quality or "clean"
304
+ own_warnings = list(node.warnings) if node.warnings else []
305
+ complex_count = sum(1 for c in node.children if c.quality == "complex")
306
+ warning_count = sum(1 for c in node.children if c.quality == "warning")
307
+ worst_child = (
308
+ "warning" if warning_count
309
+ else ("complex" if complex_count else "clean")
310
+ )
311
+ if quality_order[worst_child] > quality_order.get(own_quality, 0):
312
+ node.quality = worst_child
313
+ if warning_count:
314
+ label = "function" if warning_count == 1 else "functions"
315
+ own_warnings.append(f"Contains {warning_count} high-risk {label}")
316
+ if complex_count:
317
+ label = "function" if complex_count == 1 else "functions"
318
+ own_warnings.append(f"{complex_count} complex {label} inside")
319
+ node.warnings = own_warnings if own_warnings else None
320
+
321
+
322
+ # ---------------------------------------------------------------------------
323
+ # Cache
324
+ # ---------------------------------------------------------------------------
325
+
326
+
327
+ def _cache_key(node: CodeNode) -> str:
328
+ """Generate a cache key based on filepath, name, and source hash."""
329
+ source_hash = hashlib.md5(
330
+ node.source.encode(), usedforsecurity=False
331
+ ).hexdigest()
332
+ return f"{node.filepath}::{node.name}::{source_hash}"
333
+
334
+
335
+ def _load_cache(path: str) -> dict:
336
+ """Load cache from JSON file."""
337
+ try:
338
+ with open(path, encoding="utf-8") as f:
339
+ data = json.load(f)
340
+ if isinstance(data, dict) and data.get("version") == 1:
341
+ return data
342
+ except (FileNotFoundError, json.JSONDecodeError, OSError):
343
+ pass
344
+ return {"version": 1, "model": "", "entries": {}}
345
+
346
+
347
+ def _save_cache(path: str, data: dict) -> None:
348
+ """Save cache to JSON file."""
349
+ try:
350
+ with open(path, "w", encoding="utf-8") as f:
351
+ json.dump(data, f, indent=2)
352
+ except OSError as e:
353
+ print(f"Warning: could not save cache: {e}", file=sys.stderr)
354
+
355
+
356
+ # ---------------------------------------------------------------------------
357
+ # Node ID assignment
358
+ # ---------------------------------------------------------------------------
359
+
360
+
361
def assign_node_ids(root: CodeNode) -> dict[str, CodeNode]:
    """Assign a unique 12-char hex node_id to every node in the tree.

    The id is an md5 prefix of the node's path of (type, name) parts,
    so ids are stable across runs for unchanged trees. Returns a
    node_id -> CodeNode lookup table.
    """
    lookup: dict[str, CodeNode] = {}
    pending: list[tuple[CodeNode, list[str]]] = [(root, [root.name])]
    while pending:
        node, parts = pending.pop()
        digest = hashlib.md5(
            "::".join(parts).encode(), usedforsecurity=False
        )
        node.node_id = digest.hexdigest()[:12]
        lookup[node.node_id] = node
        for child in node.children:
            pending.append((child, parts + [child.node_type, child.name]))
    return lookup
381
+
382
+
383
+ # ---------------------------------------------------------------------------
384
+ # Single-node analysis (used by server)
385
+ # ---------------------------------------------------------------------------
386
+
387
+
388
def analyze_single_node(node: CodeNode, model: str, cache_dir: str) -> None:
    """Run quality scoring plus AI analysis on one node.

    Consults the on-disk cache first and writes new results back.
    Small nodes and directories never trigger a model call; a missing
    ollama package short-circuits with a placeholder summary.
    """
    if ollama is None:
        node.summary = "AI unavailable (ollama not installed)"
        return

    # Local quality pass (radon + heuristics).
    node.quality, node.warnings = _score_quality(node)

    # Too small to be worth a model round-trip.
    if node.line_count < MIN_LINES_FOR_AI:
        node.summary = f"Small {node.node_type} ({node.line_count} lines)"
        return

    # Directories are summarized from their children, no AI involved.
    if node.node_type == "directory":
        _summarize_directory(node)
        return

    cache_path = os.path.join(cache_dir, CACHE_FILENAME)
    cache = _load_cache(cache_path)
    # A model switch invalidates every cached entry.
    if cache.get("model") != model:
        cache = {"version": 1, "model": model, "entries": {}}

    key = _cache_key(node)
    cached = cache["entries"]
    if key in cached:
        node.summary = cached[key].get("summary")
        node.pseudocode = cached[key].get("pseudocode")
        return

    try:
        summary, pseudocode = _summarize_with_ai(node, model)
        node.summary = summary
        node.pseudocode = pseudocode
        cached[key] = {"summary": summary, "pseudocode": pseudocode}
        _save_cache(cache_path, cache)
    except (ConnectionError, RuntimeError, ValueError, OSError) as err:
        node.summary = f"Summary generation failed: {err}"
434
+
435
+
436
+ # ---------------------------------------------------------------------------
437
+ # Main entry points
438
+ # ---------------------------------------------------------------------------
439
+
440
+
441
+ def _collect_nodes(
442
+ node: CodeNode, depth: int = 0,
443
+ ) -> list[tuple[CodeNode, int]]:
444
+ """Collect all nodes with their depth for priority batching."""
445
+ result = [(node, depth)]
446
+ for child in node.children:
447
+ result.extend(_collect_nodes(child, depth + 1))
448
+ return result
449
+
450
+
451
def analyze(  # pylint: disable=too-many-locals,too-many-statements
    root: CodeNode,
    model: str = "qwen3:14b",
    workers: int = 1,
) -> CodeNode:
    """Analyze the full tree with AI summaries and quality scoring.

    Uses priority batching:
    1. Quality-score all nodes (fast pass).
    2. AI-analyze files (shallowest first).
    3. AI-analyze classes/functions/methods (shallowest first).
    4. Synthesize directory summaries (deepest first / bottom-up).

    Exits the process with status 1 when the ollama package is missing
    or the ollama server cannot be reached.
    """
    if ollama is None:
        print(
            "Error: ollama package not installed. "
            "Install with: pip install ollama\n"
            "Or use --no-ai to skip AI analysis.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Cache lives next to the analyzed tree root.
    cache_dir = root.filepath or "."
    cache_path = os.path.join(cache_dir, CACHE_FILENAME)
    cache = _load_cache(cache_path)

    # Invalidate cache if model changed
    if cache.get("model") != model:
        cache = {"version": 1, "model": model, "entries": {}}

    all_nodes = _collect_nodes(root)
    total = len(all_nodes)
    counter = [0]  # list so nested closures can mutate the count
    cache_lock = threading.Lock()
    progress_lock = threading.Lock()
    start_time = time.monotonic()

    def _progress(label: str) -> None:
        """Thread-safe progress line on stderr."""
        with progress_lock:
            counter[0] += 1
            print(f"[{counter[0]}/{total}] {label}...", file=sys.stderr)

    def _ai_analyze(node: CodeNode) -> None:
        """Run AI analysis on a single non-directory node."""
        label = node.name
        if node.line_count < MIN_LINES_FOR_AI:
            node.summary = f"Small {node.node_type} ({node.line_count} lines)"
            _progress(f"Skipping small {label}")
            return

        key = _cache_key(node)
        with cache_lock:
            if key in cache["entries"]:
                entry = cache["entries"][key]
                node.summary = entry.get("summary")
                node.pseudocode = entry.get("pseudocode")
                _progress(f"Cache hit: {label}")
                return

        _progress(f"Analyzing {label}")
        try:
            summary, pseudocode = _summarize_with_ai(node, model)
            with cache_lock:
                node.summary = summary
                node.pseudocode = pseudocode
                cache["entries"][key] = {
                    "summary": summary,
                    "pseudocode": pseudocode,
                }
        except ConnectionError:
            # BUGFIX: previously the broad handler below swallowed
            # ConnectionError per node, so an unreachable ollama server
            # printed one error per node and "completed" anyway, and the
            # outer except ConnectionError / as_completed re-raise were
            # dead code. Re-raise so the run aborts with a clear message.
            node.summary = "Summary generation failed"
            raise
        except Exception as e:  # pylint: disable=broad-exception-caught
            node.summary = "Summary generation failed"
            print(
                f"  AI error for {label}: {e}",
                file=sys.stderr,
            )

    try:
        # Phase 1: Quality-score all nodes (local, fast).
        for node, _depth in all_nodes:
            quality, warnings = _score_quality(node)
            node.quality = quality
            node.warnings = warnings

        # Phase 1b: Rollup quality to files and classes, deepest first
        # so nested classes fold into their file correctly.
        rollup_nodes = [
            (n, d) for n, d in all_nodes
            if n.node_type in ("file", "class")
        ]
        rollup_nodes.sort(key=lambda x: x[1], reverse=True)
        for node, _depth in rollup_nodes:
            _rollup_quality(node)

        # Phase 2: AI-analyze files, shallowest first (top-level
        # summaries become available earliest).
        files = [(n, d) for n, d in all_nodes if n.node_type == "file"]
        files.sort(key=lambda x: x[1])

        # Phase 3: AI-analyze classes/functions/methods, shallowest first.
        code_nodes = [(n, d) for n, d in all_nodes
                      if n.node_type in ("class", "function", "method")]
        code_nodes.sort(key=lambda x: x[1])

        # Combine phases 2 & 3 into a single ordered work list.
        ai_nodes = [n for n, _d in files] + [n for n, _d in code_nodes]

        if workers == 1:
            for node in ai_nodes:
                _ai_analyze(node)
        else:
            with ThreadPoolExecutor(max_workers=workers) as executor:
                futures = {executor.submit(_ai_analyze, node): node
                           for node in ai_nodes}
                for future in as_completed(futures):
                    exc = future.exception()
                    if isinstance(exc, ConnectionError):
                        raise exc

        # Phase 4: Synthesize directory summaries, deepest first so
        # parents aggregate already-finished children.
        dirs = [(n, d) for n, d in all_nodes if n.node_type == "directory"]
        dirs.sort(key=lambda x: x[1], reverse=True)
        for node, _depth in dirs:
            _summarize_directory(node)

    except ConnectionError as e:
        print(
            f"\nError: Could not connect to ollama: {e}\n"
            "Make sure ollama is running (ollama serve),"
            " or use --no-ai to skip AI analysis.",
            file=sys.stderr,
        )
        sys.exit(1)

    _save_cache(cache_path, cache)

    elapsed = time.monotonic() - start_time
    ai_count = len(files) + len(code_nodes)
    print(
        f"Analysis complete: {ai_count} nodes in {elapsed:.1f}s "
        f"({workers} workers, model: {model})",
        file=sys.stderr,
    )

    return root
594
+
595
+
596
def analyze_no_ai(root: CodeNode) -> CodeNode:
    """Quality-score the whole tree without any model calls."""
    total = _count_nodes(root)
    counter = [0]  # list so the nested closure can mutate it

    def _visit(node: CodeNode) -> None:
        counter[0] += 1
        print(
            f"[{counter[0]}/{total}] Scoring {node.name}...",
            file=sys.stderr,
        )

        node.quality, node.warnings = _score_quality(node)

        # Recurse first: rollups and directory summaries below need
        # fully scored children (post-order).
        for child in node.children:
            _visit(child)

        if node.node_type in ("file", "class"):
            _rollup_quality(node)

        if node.node_type == "directory":
            _summarize_directory(node)

    _visit(root)
    return root