code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,717 @@
1
+ """
2
+ Multi-Review Documentation Pipeline
3
+
4
+ Orchestrates a writer agent with 3 sequential reviewer agents to produce
5
+ high-quality documentation through iterative refinement with regression checks.
6
+
7
+ Design:
8
+ - Writer: Applies feedback and maintains citations
9
+ - Reviewer A: Deep technical accuracy check (establishes baseline)
10
+ - Reviewer B: Progressive disclosure focus + critical technical regression checks
11
+ - Reviewer C: Consistency/formatting focus + final safety checks for any regressions
12
+
13
+ Each reviewer sees only the current draft (fresh eyes, no previous feedback).
14
+ Later reviewers flag CRITICAL issues outside their domain to catch regressions.
15
+ """
16
+
17
+ import json
18
+ import logging
19
+ from dataclasses import dataclass, field
20
+ from enum import Enum
21
+ from pathlib import Path
22
+ from typing import Dict, List, Any, Optional
23
+
24
+ from .providers import create_generator, TextGenerator
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class ReviewerRole(Enum):
    """Identify the three reviewer passes run by the pipeline.

    Each member's value is the short reviewer name ("reviewer_a", ...) that
    is used for log messages and as the reviewer label on feedback objects.
    """

    # First pass: technical accuracy and architecture.
    TECHNICAL_ACCURACY = "reviewer_a"
    # Second pass: progressive disclosure and conceptual layering.
    PROGRESSIVE_DISCLOSURE = "reviewer_b"
    # Third pass: consistency, formatting and cross-references.
    CONSISTENCY = "reviewer_c"
34
+
35
+
36
@dataclass
class ReviewFeedback:
    """Feedback produced by one reviewer pass, in structured form.

    ``issues`` holds one dict per reported problem, ``strengths`` and
    ``questions`` are free-text lists, and ``raw_response`` keeps the
    unparsed model output the feedback was built from.
    """

    reviewer: str
    overall_assessment: str
    issues: List[Dict[str, str]] = field(default_factory=list)
    strengths: List[str] = field(default_factory=list)
    questions: List[str] = field(default_factory=list)
    raw_response: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the feedback as a plain dict, leaving out ``raw_response``."""
        exported = (
            "reviewer",
            "overall_assessment",
            "issues",
            "strengths",
            "questions",
        )
        return {name: getattr(self, name) for name in exported}
54
+
55
+
56
+ # Role definitions with regression check instructions
57
# System prompt for the first reviewer pass (technical accuracy).
# The "location" field asks for the exact filename first because the writer
# stage routes issues to document batches by looking for the filename in it.
REVIEWER_A_ROLE = """You are Reviewer A: Technical Accuracy & Architecture Specialist

**PRIMARY FOCUS (Your Main Responsibility):**
1. **Technical Correctness**
- Code examples match actual implementation
- API signatures are accurate (parameters, return types, exceptions)
- Dependencies and versions are correct
- Error handling is properly documented
- Command-line examples work as shown

2. **Architecture & Terminology**
- Architectural diagrams match code structure
- Terminology is consistent and precise
- Component relationships are accurate
- Design patterns are correctly identified
- File paths and module names match actual codebase

**SECONDARY FOCUS:** None (you're establishing the technical baseline)

**Output Format (JSON):**
Return ONLY valid JSON. No markdown code blocks, no commentary before or after.
Start your response with `{` and end with `}`.

```json
{
"overall_assessment": "Brief verdict (2-3 sentences)",
"issues": [
{
"location": "<exact filename>: <section name or line reference> (use 'general' for cross-document issues)",
"severity": "critical|major|minor",
"category": "accuracy|architecture|terminology",
"description": "What's wrong and why it matters",
"suggested_fix": "Concrete suggestion with correct information"
}
],
"strengths": ["What works well technically"],
"questions": ["Unclear items needing human review"]
}
```

CRITICAL: Ensure all strings are properly closed with quotes. Keep descriptions concise to avoid JSON parsing errors.

**Review Criteria:**
- Does the code actually work this way?
- Are API signatures exactly correct?
- Is terminology aligned with codebase naming?
- Would a developer be misled by any technical claims?
"""
105
+
106
# System prompt for the second reviewer pass (progressive disclosure, plus a
# light technical regression check).
# The "location" field asks for the exact filename first because the writer
# stage routes issues to document batches by looking for the filename in it.
REVIEWER_B_ROLE = """You are Reviewer B: Progressive Disclosure & Conceptual Layering Specialist

**PRIMARY FOCUS (Your Main Responsibility):**
1. **Progressive Disclosure**
- Start with essentials (install, quickstart, basic usage)
- Build complexity gradually
- Avoid overwhelming beginners with advanced topics too early
- Clear path from "first steps" to "advanced usage"

2. **Conceptual Coherence**
- Clear mental models before implementation details
- Analogies and examples aid understanding
- Concepts build on each other logically
- Appropriate detail level for target audience

3. **Boilerplate Repetition Check (CRITICAL)**
- ONLY "Getting Started" should have full Installation + Quickstart + Citation blocks
- ALL other documents should replace these with a cross-reference like:
"For installation instructions, see [Getting Started](getting-started.md)"
- Flag ANY document (except Getting Started) that duplicates Installation/Quickstart/Citation sections
- This is a MAJOR disclosure issue: readers must scroll past 15-20 lines of boilerplate on every page

**SECONDARY FOCUS (Regression Check):**
- Quick scan for CRITICAL technical errors introduced during revision
- Only flag if obvious/critical (e.g., broken code example, wrong command syntax)
- NOT a full technical re-review, just a safety net
- Examples to flag: `pip install wrong-package`, incorrect function signature in code block

**Output Format (JSON):**
Return ONLY valid JSON. No markdown code blocks, no commentary before or after.
Start your response with `{` and end with `}`.

```json
{
"overall_assessment": "Brief verdict on information flow and any critical regressions",
"issues": [
{
"location": "<exact filename>: <section or paragraph> (use 'general' for cross-document issues)",
"severity": "critical|major|minor",
"category": "disclosure|layering|clarity|regression_technical|boilerplate",
"description": "What disrupts learning flow OR critical technical error introduced",
"suggested_fix": "How to restructure/simplify OR correct the technical error"
}
],
"strengths": ["Well-structured explanations"],
"questions": ["Ambiguous ordering decisions"]
}
```

CRITICAL: Ensure all strings are properly closed with quotes. Keep descriptions concise to avoid JSON parsing errors.

**Review Criteria:**
- Can a newcomer follow this?
- Do advanced topics appear too early?
- Are concepts properly scaffolded?
- **REGRESSION CHECK:** Are there obvious broken code examples or commands?
"""
163
+
164
# System prompt for the third reviewer pass (consistency and formatting, plus
# a final regression check).
# The "location" field asks for the exact filename first because the writer
# stage routes issues to document batches by looking for the filename in it.
REVIEWER_C_ROLE = """You are Reviewer C: Consistency, Formatting & Cross-Reference Specialist

**PRIMARY FOCUS (Your Main Responsibility):**
1. **Consistency & Naming**
- Variable/function names match codebase exactly
- Consistent terminology across sections
- Cross-references are accurate
- Examples use real file paths from the repository

2. **Formatting & Patterns**
- Code blocks are properly formatted
- Citations follow [CITE:source] convention
- Headings follow proper hierarchy
- Lists and tables are well-structured
- Links work and point to correct locations

3. **Boilerplate Duplication Check (CRITICAL)**
- ONLY "Getting Started" should contain full Installation + Quickstart + Citation blocks
- ALL other documents must use cross-references instead:
"For installation, see [Getting Started](getting-started.md)"
- Flag ANY document (except Getting Started) that duplicates these sections
- This is a consistency violation: same boilerplate repeated across multiple files

**SECONDARY FOCUS (Final Safety Check):**
- Quick scan for CRITICAL regressions (technical OR disclosure)
- Last chance to catch obvious errors before final draft
- Only flag if severe (e.g., completely broken example, major inconsistency introduced)
- Examples to flag: duplicate boilerplate text, contradictory instructions, broken citations

**Output Format (JSON):**
Return ONLY valid JSON. No markdown code blocks, no commentary before or after.
Start your response with `{` and end with `}`.

```json
{
"overall_assessment": "Brief verdict on polish, consistency, and any critical regressions",
"issues": [
{
"location": "<exact filename>: <specific location> (use 'general' for cross-document issues)",
"severity": "critical|major|minor",
"category": "consistency|formatting|naming|citations|boilerplate|regression_technical|regression_disclosure",
"description": "What's inconsistent/malformed OR critical regression",
"suggested_fix": "Correction"
}
],
"strengths": ["Well-formatted sections"],
"questions": ["Questionable naming choices"]
}
```

CRITICAL: Ensure all strings are properly closed with quotes. Keep descriptions concise to avoid JSON parsing errors.

**Review Criteria:**
- Is naming consistent with the repo?
- Are citations properly formatted?
- Do cross-references work?
- **REGRESSION CHECK:** Any obvious technical or disclosure errors introduced?
- **CONSISTENCY CHECK:** Is same boilerplate text repeated across multiple sections?
"""
223
+
224
# System prompt for the writer agent.
# The output must consist solely of "=== filename ===" sections, so the rules
# below avoid asking for notes or explanations (which would contradict the
# output format) and require the input filenames to be reused verbatim, since
# revisions are matched back to drafts by filename.
WRITER_ROLE = """You are a technical documentation writer with deep codebase expertise.

**Your Responsibilities:**
- Apply reviewer feedback systematically and accurately
- Verify all changes against the codebase
- Preserve ALL existing [CITE:...] citations
- Maintain document structure and formatting

**Feedback Application Process:**
1. Read reviewer feedback carefully
2. For each issue:
- Assess validity against codebase
- Apply fix if valid
- Preserve or add appropriate citations
- Skip feedback you cannot verify or apply, leaving that text unchanged
3. Maintain consistency across all sections

**Critical Rules:**
- NEVER remove existing [CITE:...] citations
- If you move text, keep its citation attached
- If you add new factual claims, add appropriate citations
- Be conservative: if unsure about a fix, keep the original text unchanged
- When fixing technical errors, verify against actual code

**Boilerplate Content Rules:**
- ONLY "Getting Started" should have full Installation + Quickstart + Citation blocks
- ALL other documents should replace these sections with cross-references like:
"For installation instructions, see [Getting Started](getting-started.md)"
- Remove duplicate Installation/Quickstart/Citation blocks from non-Getting-Started documents

**Style Guidelines:**
- Progressive disclosure (simple → complex)
- Technical accuracy over brevity
- Concrete examples with citations
- Clear architecture explanations

**CRITICAL OUTPUT FORMAT:**
You MUST return ONLY the revised document content.
DO NOT include explanations, commentary, analysis, or acknowledgments.
DO NOT wrap content in markdown code blocks.
ONLY output documents in this exact delimiter format:

=== FILENAME ===
[full revised content]

=== NEXT_FILENAME ===
[full revised content]

Use each document's filename exactly as it appears in the input delimiter.
Return every document you were given, even if it is unchanged.
"""
272
+
273
+
274
class MultiReviewPipeline:
    """
    Orchestrate a writer agent and three reviewer agents over documentation drafts.

    The pipeline runs three review/revise cycles. In each cycle one reviewer
    sees only the current drafts (no earlier feedback) and the writer then
    applies that reviewer's feedback. Later reviewers also carry a secondary
    regression check, as described in their role prompts.

    Drafts are sent to the model in small batches. Documents longer than
    ``MAX_DRAFT_CHARS`` are truncated inside prompts; the part the writer
    never saw is re-attached to its revision so it is not lost.
    """

    # Number of documents sent per reviewer / writer request.
    REVIEW_BATCH_SIZE = 5
    WRITER_BATCH_SIZE = 4

    # Per-document character budget inside a prompt, and the marker appended
    # to a document that was cut to fit it.
    MAX_DRAFT_CHARS = 15000
    _TRUNCATION_MARKER = "[... content truncated for review ...]"

    def __init__(self, searcher=None, repo_path: str = "."):
        """
        Initialize the multi-review pipeline.

        Args:
            searcher: Optional hybrid searcher for evidence grounding
                (stored only; not used by the methods of this class).
            repo_path: Path to repository root
        """
        self.searcher = searcher
        self.repo_path = Path(repo_path)
        self.generator = create_generator()

    def run(
        self,
        drafts: Dict[str, str],
        output_dir: Path
    ) -> Dict[str, str]:
        """
        Run the complete multi-review pipeline.

        Args:
            drafts: Dictionary of {filename: content} for documentation
            output_dir: Where to save final outputs (created if missing)

        Returns:
            Dictionary of final revised documentation. The same content is
            written under ``output_dir``, next to a ``REVIEW_LOG.json`` file
            holding each reviewer's feedback.
        """
        logger.info("Starting multi-review pipeline (Writer + 3 Reviewers)")

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Review order matters: A sets the technical baseline, B and C also
        # look for regressions introduced by earlier revisions.
        cycles = (
            (ReviewerRole.TECHNICAL_ACCURACY,
             "Reviewer A (Technical Accuracy)"),
            (ReviewerRole.PROGRESSIVE_DISCLOSURE,
             "Reviewer B (Progressive Disclosure + Regression Check)"),
            (ReviewerRole.CONSISTENCY,
             "Reviewer C (Consistency + Final Safety Check)"),
        )

        # Feedback of every cycle, kept as an audit trail.
        review_log = {}
        current_drafts = dict(drafts)

        for cycle_num, (role, label) in enumerate(cycles, start=1):
            logger.info(f"Cycle {cycle_num}: {label}")
            feedback = self._spawn_reviewer_agent(role=role, drafts=current_drafts)
            review_log[role.value] = feedback.to_dict()
            current_drafts = self._spawn_writer_agent(
                drafts=current_drafts,
                feedback=feedback
            )

        final_drafts = current_drafts

        logger.info("Saving final documentation...")
        for filename, content in final_drafts.items():
            output_file = output_dir / filename
            # Draft names may contain sub-directories.
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # Explicit encoding: documents may contain non-ASCII characters
            # and the platform default encoding may not be able to store them.
            output_file.write_text(content, encoding="utf-8")
            logger.info(f"  ✓ {output_file}")

        review_log_file = output_dir / "REVIEW_LOG.json"
        review_log_file.write_text(json.dumps(review_log, indent=2), encoding="utf-8")
        logger.info(f"Review log saved: {review_log_file}")

        logger.info("Multi-review pipeline complete!")
        return final_drafts

    @staticmethod
    def _batches(drafts: Dict[str, str], size: int) -> List[Dict[str, str]]:
        """Split ``drafts`` into consecutive dicts of at most ``size`` entries."""
        items = list(drafts.items())
        return [dict(items[start:start + size]) for start in range(0, len(items), size)]

    def _spawn_reviewer_agent(
        self,
        role: ReviewerRole,
        drafts: Dict[str, str]
    ) -> ReviewFeedback:
        """
        Spawn isolated reviewer agent with fresh context.

        Reviews documents in batches to avoid API timeouts.

        Args:
            role: Which reviewer type to spawn
            drafts: Current documentation drafts

        Returns:
            Structured feedback from the reviewer (merged from all batches).
            The merged object does not carry the per-batch raw responses.
        """
        batches = self._batches(drafts, self.REVIEW_BATCH_SIZE)
        total_batches = len(batches)
        all_issues = []
        all_strengths = []
        all_questions = []
        overall_assessments = []

        role_prompt = self._get_role_prompt(role)

        logger.info(f"  Spawning {role.value} with isolated context ({total_batches} batches)...")

        for batch_num, batch in enumerate(batches, start=1):
            logger.info(f"    Reviewing batch {batch_num}/{total_batches} ({len(batch)} documents)...")

            # The filename instruction lets the writer stage route each issue
            # to the batch that contains the document it refers to.
            user_prompt = f"""Review the following documentation drafts against the codebase.

IMPORTANT: You are seeing these drafts with fresh eyes. You have no knowledge of previous reviews.

Batch {batch_num}/{total_batches}:
{self._format_drafts(batch)}

Provide your feedback as JSON following the format specified in your role description.
Focus on your PRIMARY responsibilities, but also note any CRITICAL issues in your SECONDARY focus area if applicable.
Start every issue's "location" with the exact filename it refers to, e.g. "SYNTHESIZED_CONCEPTS.md: Installation". Use "general" for issues that apply to all documents.
"""

            response = self.generator.generate(
                system_prompt=role_prompt,
                user_prompt=user_prompt,
                temperature=0.15,  # Low temperature for more consistent reviews
                max_tokens=3000
            )

            batch_feedback = self._parse_feedback(response, role.value)
            all_issues.extend(batch_feedback.issues)
            all_strengths.extend(batch_feedback.strengths)
            all_questions.extend(batch_feedback.questions)
            overall_assessments.append(batch_feedback.overall_assessment)

        merged_assessment = " | ".join(overall_assessments) if overall_assessments else "No issues found"

        logger.info(f"  {role.value} completed all {total_batches} batches ({len(all_issues)} total issues)")

        return ReviewFeedback(
            reviewer=role.value,
            overall_assessment=merged_assessment,
            issues=all_issues,
            strengths=all_strengths,
            questions=all_questions
        )

    def _spawn_writer_agent(
        self,
        drafts: Dict[str, str],
        feedback: ReviewFeedback
    ) -> Dict[str, str]:
        """
        Spawn writer agent to apply reviewer feedback.

        Processes documents in batches to avoid API timeouts.

        Args:
            drafts: Current documentation drafts
            feedback: Reviewer feedback to apply

        Returns:
            Revised documentation drafts, with exactly the same filenames as
            ``drafts``. A document whose revision could not be parsed keeps
            its current content.
        """
        batches = self._batches(drafts, self.WRITER_BATCH_SIZE)
        total_batches = len(batches)
        all_filenames = list(drafts)
        revised_drafts = {}

        logger.info(f"  Writer applying {feedback.reviewer} feedback in {total_batches} batches...")

        for batch_num, batch in enumerate(batches, start=1):
            logger.info(f"    Processing batch {batch_num}/{total_batches} ({len(batch)} documents)...")

            # Only pass on issues that concern this batch (or no file at all).
            batch_feedback = self._filter_feedback_for_batch(
                feedback, batch.keys(), all_filenames
            )

            user_prompt = f"""Apply the following reviewer feedback to the documentation drafts.

Current Drafts (Batch {batch_num}/{total_batches}):
{self._format_drafts(batch)}

Reviewer Feedback:
{json.dumps(batch_feedback.to_dict(), indent=2)}

For each issue:
1. Verify the concern against the codebase
2. Apply the fix if valid
3. Preserve ALL [CITE:...] citations
4. Make precise, targeted edits

CRITICAL OUTPUT REQUIREMENTS:
- Return ONLY the revised document content
- NO explanations, commentary, or analysis
- NO markdown code blocks (```markdown, etc.)
- Use EXACTLY this format for each file:

=== FILENAME ===
[complete revised content of the file]

=== NEXT_FILENAME ===
[complete revised content of next file]

EXAMPLES OF CORRECT FORMAT:

Example 1 (correct):
=== SYNTHESIZED_GETTING_STARTED.md ===
# Getting Started

Speculators is a library for speculative decoding...

## Installation
...

=== SYNTHESIZED_CONCEPTS.md ===
# Concepts

For installation instructions, see [Getting Started](getting-started.md).

## Overview
...

Example 2 (INCORRECT - do NOT do this):
I've reviewed the feedback and applied the following changes:
- Removed duplicate Installation section from Concepts
- Fixed the boilerplate issue
Here are the revised documents:
=== SYNTHESIZED_CONCEPTS.md ===
...

CRITICAL: Your response must START with "=== " (the first three characters).
Do not write ANYTHING before the first "=== " delimiter.
"""

            response = self.generator.generate(
                system_prompt=WRITER_ROLE,
                user_prompt=user_prompt,
                temperature=0.2,
                max_tokens=8000  # Reduced from 16000 since processing fewer docs
            )

            batch_revised = self._parse_revised_drafts(response, batch)
            for filename, original in batch.items():
                # The writer only saw the head of an over-long document, so
                # put the unseen tail back behind its revision.
                revised_drafts[filename] = self._reattach_tail(
                    original, batch_revised[filename]
                )

        logger.info(f"  Writer completed all {total_batches} batches")
        return revised_drafts

    def _filter_feedback_for_batch(
        self,
        feedback: ReviewFeedback,
        batch_filenames: set,
        all_filenames: Optional[List[str]] = None
    ) -> ReviewFeedback:
        """
        Filter feedback to only include issues relevant to the current batch.

        An issue is kept when its ``location`` mentions a file of the batch,
        or when it is general (mentions no file of another batch).

        Args:
            feedback: Feedback to filter (not modified)
            batch_filenames: Filenames in the current batch (any collection
                of strings works, e.g. ``dict.keys()``)
            all_filenames: Every draft filename known to the caller. When
                given, an issue is treated as belonging to another batch
                only if it names one of those files. When omitted, the
                looser legacy rule is used: a location containing ".md:"
                is taken to name a specific file.

        Returns:
            A copy of ``feedback`` whose ``issues`` list is filtered
        """
        from dataclasses import replace

        # Empty names would match every location, so drop them.
        batch_names = [name for name in batch_filenames if name]
        other_names = [
            name for name in (all_filenames or ())
            if name and name not in batch_names
        ]

        relevant_issues = []
        for issue in feedback.issues:
            # Reviewer output is model-generated; location may not be a str.
            location = str(issue.get("location", ""))
            if any(name in location for name in batch_names):
                relevant_issues.append(issue)
            elif all_filenames is not None:
                if not any(name in location for name in other_names):
                    relevant_issues.append(issue)
            elif ".md:" not in location:
                relevant_issues.append(issue)

        return replace(feedback, issues=relevant_issues)

    def _get_role_prompt(self, role: ReviewerRole) -> str:
        """Get the system prompt for a reviewer role (KeyError if unknown)."""
        role_map = {
            ReviewerRole.TECHNICAL_ACCURACY: REVIEWER_A_ROLE,
            ReviewerRole.PROGRESSIVE_DISCLOSURE: REVIEWER_B_ROLE,
            ReviewerRole.CONSISTENCY: REVIEWER_C_ROLE,
        }
        return role_map[role]

    @classmethod
    def _split_for_prompt(cls, content: str):
        """
        Split ``content`` into the part shown in prompts and the unseen rest.

        Returns a ``(head, tail)`` pair with ``head + tail == content``.
        ``tail`` is empty when the content fits in ``MAX_DRAFT_CHARS``.
        The cut is moved back to the previous line break when there is one
        in the second half of the budget, so the two parts can be joined
        again without splitting a line.
        """
        limit = cls.MAX_DRAFT_CHARS
        if len(content) <= limit:
            return content, ""
        cut = content.rfind("\n", 0, limit) + 1
        if cut < limit // 2:
            cut = limit
        return content[:cut], content[cut:]

    def _reattach_tail(self, original: str, revised: str) -> str:
        """
        Append the part of ``original`` that was cut from the prompt to ``revised``.

        Returns ``revised`` unchanged when the original was not truncated or
        when the revision is the original itself.
        """
        head, tail = self._split_for_prompt(original)
        if not tail or revised == original:
            return revised
        # The model may echo the truncation marker it was shown.
        body = revised.replace(self._TRUNCATION_MARKER, "").rstrip()
        # Reuse the whitespace that separated head and tail in the original.
        separator = head[len(head.rstrip()):]
        return body + separator + tail

    def _format_drafts(self, drafts: Dict[str, str]) -> str:
        """Format drafts for inclusion in prompts, truncating over-long ones."""
        formatted = []
        for filename, content in drafts.items():
            head, tail = self._split_for_prompt(content)
            truncation_note = f"\n{self._TRUNCATION_MARKER}" if tail else ""
            formatted.append(f"=== {filename} ===\n{head}{truncation_note}\n")
        return "\n\n".join(formatted)

    @staticmethod
    def _json_candidates(response: str) -> List[str]:
        """
        List substrings of ``response`` that may hold the JSON object.

        In order: the whole text, the content of the first code fence (an
        unterminated fence runs to the end of the text), and the span from
        the first ``{`` to the last ``}``.
        """
        text = (response or "").strip()
        candidates = [text]

        for marker in ("```json", "```"):
            start = text.find(marker)
            if start == -1:
                continue
            start += len(marker)
            end = text.find("```", start)
            candidates.append(text[start:end if end != -1 else len(text)].strip())
            break

        first, last = text.find("{"), text.rfind("}")
        if first != -1 and last > first:
            candidates.append(text[first:last + 1])
        return candidates

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Return ``value`` if it is a list, else an empty list."""
        return value if isinstance(value, list) else []

    def _parse_feedback(self, response: str, reviewer_name: str) -> ReviewFeedback:
        """
        Parse reviewer response into structured feedback.

        Args:
            response: Raw LLM response
            reviewer_name: Name of the reviewer

        Returns:
            Structured ReviewFeedback object. When no JSON object can be
            extracted, a single "parsing_error" issue carrying the start of
            the raw response is returned instead.
        """
        data = None
        failure = "no JSON object found"
        for candidate in self._json_candidates(response):
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError as exc:
                failure = str(exc)
                continue
            if isinstance(parsed, dict):
                data = parsed
                break
            failure = f"expected a JSON object, got {type(parsed).__name__}"

        if data is None:
            logger.warning(f"Failed to parse {reviewer_name} feedback as JSON: {failure}")
            # Fallback: treat as plain text issues
            return ReviewFeedback(
                reviewer=reviewer_name,
                overall_assessment="Could not parse structured feedback",
                issues=[{
                    "location": "general",
                    "severity": "minor",
                    "category": "parsing_error",
                    "description": f"Reviewer response: {(response or '')[:500]}",
                    "suggested_fix": "Manual review required"
                }],
                raw_response=response
            )

        # Fields come from a model, so coerce them to the shapes the rest of
        # the pipeline relies on (str assessment, list-of-dict issues).
        return ReviewFeedback(
            reviewer=reviewer_name,
            overall_assessment=str(data.get("overall_assessment") or "No assessment provided"),
            issues=[item for item in self._as_list(data.get("issues")) if isinstance(item, dict)],
            strengths=[str(item) for item in self._as_list(data.get("strengths"))],
            questions=[str(item) for item in self._as_list(data.get("questions"))],
            raw_response=response
        )

    @staticmethod
    def _split_delimited_sections(response: str) -> Dict[str, str]:
        """
        Split a writer response into ``{filename: body}``.

        Only a line that consists solely of ``=== name ===`` starts a new
        section, so "===" inside document text (e.g. ``a === b``) is left
        alone. Text before the first header is ignored. If a name occurs
        twice, the last section wins.
        """
        sections: Dict[str, str] = {}
        current = None
        buffer: List[str] = []

        for line in response.split("\n"):
            marker = line.strip()
            name = ""
            if marker.startswith("=== ") and marker.endswith(" ===") and len(marker) > 8:
                name = marker[4:-4].strip()
            if name:
                if current is not None:
                    sections[current] = "\n".join(buffer)
                current = name
                buffer = []
            elif current is not None:
                buffer.append(line)

        if current is not None:
            sections[current] = "\n".join(buffer)
        return sections

    @staticmethod
    def _strip_wrapping_fence(content: str) -> str:
        """
        Remove a markdown code fence that wraps the whole document.

        The fence is removed only when the first line is a bare or
        markdown-tagged opening fence and the last line is a closing fence;
        code blocks inside the document are never touched.
        """
        stripped = content.strip()
        lines = stripped.split("\n")
        if (
            len(lines) >= 2
            and lines[0].strip().lower() in ("```", "```markdown", "```md")
            and lines[-1].strip() == "```"
        ):
            return "\n".join(lines[1:-1]).strip()
        return stripped

    def _parse_revised_drafts(
        self,
        response: str,
        original_drafts: Dict[str, str]
    ) -> Dict[str, str]:
        """
        Parse writer's revised drafts from response.

        Args:
            response: Raw LLM response with revised drafts
            original_drafts: Original drafts (fallback if parsing fails)

        Returns:
            Dictionary of revised drafts with exactly the filenames of
            ``original_drafts``. Sections for unknown filenames and empty
            revisions are ignored, so the model can neither add files nor
            blank out a document. ``original_drafts`` itself is returned
            when nothing usable was found.
        """
        if not response.strip().startswith("==="):
            logger.warning(
                f"Writer response does not start with '===' delimiter. "
                f"First 200 chars: {response[:200]}"
            )

        revised = {}
        for filename, body in self._split_delimited_sections(response).items():
            if filename not in original_drafts:
                logger.warning(f"Ignoring revision for unknown file {filename!r}")
                continue

            content = self._strip_wrapping_fence(body)
            if not content:
                logger.warning(f"Revision for {filename} is empty, keeping original")
                continue

            revised[filename] = content
            logger.info(f"    Successfully parsed revision for {filename} ({len(content)} chars)")

        # If no files were parsed, return originals
        if not revised:
            logger.warning("No revised drafts found in response, keeping originals")
            logger.warning(f"Response length: {len(response)} chars")
            logger.warning(f"Response preview: {response[:500]}")
            return original_drafts

        # Fill in any missing files from originals
        for filename, content in original_drafts.items():
            if filename not in revised:
                logger.warning(f"File {filename} not in revision, keeping original")
                revised[filename] = content

        return revised