bioguider-0.2.52-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/generation/llm_injector.py
@@ -0,0 +1,809 @@
+ from __future__ import annotations
+
+ import json
+ from typing import Tuple, Dict, Any, List, Set
+ import re
+ from difflib import SequenceMatcher
+
+ from langchain_openai.chat_models.base import BaseChatOpenAI
+ from bioguider.agents.common_conversation import CommonConversation
+ from bioguider.utils.utils import escape_braces
+
+
+ INJECTION_PROMPT = """
+ You are “BioGuider-Intro,” generating a deliberately flawed **INTRODUCTION** file
+ (“README-lite”) to test an auto-fixer. Start from the provided clean INTRO doc that follows the
+ BioGuider Intro structure (What is it? / What can it do? / Requirements / Install / Quick example /
+ Learn more / License & Contact). Produce a corrupted version with small, realistic defects.
+
+ GOAL
+ Introduce subtle but meaningful issues while keeping the document recognizably the same.
+
+ ERROR CATEGORIES (inject all)
+ - typo: spelling/grammar/punctuation mistakes
+ - link: malformed URL, wrong domain, or stray spaces in URL
+ - duplicate: duplicate a short line/section fragment
+ - bio_term: slightly wrong domain term (e.g., “single sell” for “single cell”); do not invent new science
+ - function: misspell a known function/API name **from the input README-lite only**
+ - markdown_structure: break a header level, list indentation, or code fence (one-off)
+ - list_structure: remove bullet space (e.g., “-item”), mix markers inconsistently
+ - section_title: subtly change a section title casing or wording
+ - image_syntax: break image markdown spacing (e.g., `![alt] (url)`)
+ - inline_code: remove backticks around inline code
+ - emphasis: break emphasis markers (e.g., missing closing `*`)
+ - table_alignment: misalign or omit a `|` in a markdown table
+ - code_lang_tag: use the wrong fenced code language (e.g., ```py for R)
+
+ BIOLOGY-SPECIFIC ERROR CATEGORIES (inject all; keep realistic & subtle)
+ - gene_symbol_case: change gene symbol casing or add suffix (e.g., “tp53”, “CD3e”), but **do not alter** protected keywords
+ - species_swap: imply human vs mouse mix-up (e.g., “mm10” vs “GRCh38”) in a short phrase
+ - ref_genome_mismatch: claim a reference genome that conflicts with the example file or text
+ - modality_confusion: conflate RNA-seq with ATAC or proteomics in a brief phrase
+ - normalization_error: misuse terms like CPM/TPM/CLR/log1p in a sentence
+ - umi_vs_read: confuse UMI counts vs read counts in a short line
+ - batch_effect: misstate “batch correction” vs “normalization” terminology
+ - qc_threshold: use a common but slightly wrong QC gate (e.g., mito% 0.5 instead of 5)
+ - file_format: mix up FASTQ/BAM/MTX/H5AD/RDS in a brief mention
+ - strandedness: claim “stranded” when workflow is unstranded (or vice versa)
+ - coordinates: confuse 0-based vs 1-based or chromosome naming style (chr1 vs 1)
+ - units_scale: use the wrong scale/unit (e.g., μm vs mm; 10e6 instead of 1e6)
+ - sample_type: conflate “primary tissue” with “cell line” in a single phrase
+ - contamination: misuse “ambient RNA” vs “doublets” terminology
+
+ CLI/CONFIG ERROR CATEGORIES (inject all)
+ - param_name: slightly misspell a CLI flag or config key (e.g., `--min-cell` → `--min-cells`)
+ - default_value: state a plausible but incorrect default value
+ - path_hint: introduce a subtle path typo (e.g., `data/filtrd`)
+
+
+ CONSTRAINTS
+ - Keep edits minimal and local; **≥85% token overlap** with input.
+ - **CRITICAL: Preserve ALL code block structure exactly**:
+   * Do NOT remove, add, or modify code fence delimiters (``` or ```{r} or ```{python})
+   * The number of ``` lines MUST be identical in input and output
+   * For RMarkdown/Rmd files, preserve ALL chunk headers like ```{r, ...}
+   * Only introduce errors INSIDE code blocks (typos in code), never break the fences
+ - **Preserve section ORDER and TITLES** from the Intro spec (if applicable):
+   1) # <project_name>
+      _<tagline>_
+   2) What is it?
+   3) What can it do?
+   4) Requirements
+   5) Install
+   6) Quick example
+   7) Learn more
+   8) License & Contact
+ - Do **not** add or remove top-level sections. Subtle line-level corruption only.
+ - Maintain a **concise length** (≤ {max_words} words).
+ - Do **not** alter the protected keywords (exact casing/spelling): {keywords}
+ - Keep at least **{min_per_category} errors per category** listed above.
+ - Limit `duplicate` injections to at most **{min_per_category}**.
+ - If the input contains runnable code, keep it mostly intact but introduce **one** realistic break
+   (e.g., missing quote/paren or wrong function name) without adding new libraries.
+ - Keep at least one **valid** URL so the fixer can compare.
+ - Do not change the project identity, domain, or language.
+ - Do not include markers, explanations, or commentary in the corrupted markdown.
+
+ INPUT INTRO (clean README-lite)
+ <<INTRO>>
+ {readme}
+ <</INTRO>>
+
+ OUTPUT (JSON only):
+ {{
+   "corrupted_markdown": "<the entire corrupted INTRO as markdown>",
+   "errors": [
+     {{
+       "id": "e1",
+       "category": "typo|link|duplicate|bio_term|function|markdown_structure",
+       "rationale": "why this mutation is realistic",
+       "original_snippet": "<verbatim snippet from input>",
+       "mutated_snippet": "<verbatim mutated text>"
+     }}
+     // include one entry per individual mutation you applied
+   ]
+ }}
+ """
+
+
+ class LLMErrorInjector:
+     def __init__(self, llm: BaseChatOpenAI):
+         self.llm = llm
+
+     def inject(self, readme_text: str, min_per_category: int = 3, preserve_keywords: list[str] | None = None, max_words: int = 450, project_terms: list[str] | None = None) -> Tuple[str, Dict[str, Any]]:
+         conv = CommonConversation(self.llm)
+         preserve_keywords = preserve_keywords or self._extract_preserve_keywords(readme_text)
+
+         # Add project terms to prompt if available
+         project_terms_section = ""
+         if project_terms:
+             terms_str = ", ".join(project_terms[:20]) # Limit to top 20 to avoid clutter
+             project_terms_section = f"\nPROJECT SPECIFIC TARGETS (Prioritize misspelling these):\n{terms_str}\n"
+
+         system_prompt = escape_braces(INJECTION_PROMPT).format(
+             readme=readme_text[:30000],
+             min_per_category=min_per_category,
+             keywords=", ".join(preserve_keywords) if preserve_keywords else "",
+             max_words=max_words,
+         )
+
+         if project_terms:
+             # Insert project terms section before ERROR CATEGORIES
+             system_prompt = system_prompt.replace("ERROR CATEGORIES (inject all)", f"{project_terms_section}\nERROR CATEGORIES (inject all)")
+
+         output, _ = conv.generate(system_prompt=system_prompt, instruction_prompt="Return the JSON now.")
+
+         # Enhanced JSON parsing with better error handling
+         data = self._parse_json_output(output, readme_text)
+         corrupted = data.get("corrupted_markdown", readme_text)
+
+         # CRITICAL: Check code block preservation before validation
+         if not self._check_code_blocks_preserved(readme_text, corrupted):
+             print("Warning: LLM output broke code blocks, using deterministic fallback")
+             corrupted, data = self._deterministic_inject(readme_text)
+         # Validate output stays within original context; fallback to deterministic if invalid
+         elif not self._validate_corrupted(readme_text, corrupted, preserve_keywords):
+             corrupted, data = self._deterministic_inject(readme_text)
+
+         # Supplement to satisfy minimum per-category counts using deterministic local edits
+         corrupted, data = self._supplement_errors(readme_text, corrupted, data, min_per_category, project_terms)
+
+         # Final safety check: ensure code blocks are still intact after supplements
+         if not self._check_code_blocks_preserved(readme_text, corrupted):
+             print("Warning: Supplements broke code blocks, reverting to baseline with minimal errors")
+             corrupted, data = self._deterministic_inject(readme_text)
+
+         manifest = {
+             "errors": data.get("errors", []),
+         }
+         return corrupted, manifest
+
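A minimal usage sketch of the class defined above. It assumes the package is installed and an OpenAI-backed chat model is available; the model name, README path, and `project_terms` values are placeholders for illustration, not defaults shipped with bioguider:

```python
# Hypothetical driver script; model name and file paths are illustrative only.
from langchain_openai import ChatOpenAI
from bioguider.generation.llm_injector import LLMErrorInjector

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)      # assumes OPENAI_API_KEY is set
readme_text = open("README.md", encoding="utf-8").read()  # any clean README-lite

injector = LLMErrorInjector(llm)
corrupted, manifest = injector.inject(
    readme_text,
    min_per_category=3,
    project_terms=["RunUMAP", "FindClusters"],            # illustrative API names only
)
print(len(manifest["errors"]), "errors injected")
```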
+     def _check_code_blocks_preserved(self, baseline: str, corrupted: str) -> bool:
+         """Check that code block structure is preserved exactly."""
+         # Count code fence lines (must match exactly)
+         base_fences = len(re.findall(r"^```", baseline, flags=re.M))
+         corr_fences = len(re.findall(r"^```", corrupted, flags=re.M))
+         if base_fences != corr_fences:
+             return False
+
+         # Check RMarkdown chunks specifically (```{r}, ```{python}, etc.)
+         base_rmd = re.findall(r"^```\{[^}]*\}", baseline, flags=re.M)
+         corr_rmd = re.findall(r"^```\{[^}]*\}", corrupted, flags=re.M)
+         if len(base_rmd) != len(corr_rmd):
+             return False
+
+         # Ensure closing ``` match opening count
+         base_close = len(re.findall(r"^```\s*$", baseline, flags=re.M))
+         corr_close = len(re.findall(r"^```\s*$", corrupted, flags=re.M))
+         if base_close != corr_close:
+             return False
+
+         return True
+
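The fence check is a plain regex count, so it is easy to verify in isolation. A self-contained sketch on a fabricated two-fence RMarkdown fragment (the backticks are assembled at runtime so this snippet itself contains no literal fence lines):

```python
import re

FENCE = "`" * 3  # build fences at runtime to keep this example self-contained
baseline = f"Intro\n{FENCE}{{r setup}}\nlibrary(Seurat)\n{FENCE}\n"
corrupted = baseline.replace("library(Seurat)", "library(Seurt)")  # typo inside the block only

pattern = r"^" + FENCE
same_fences = len(re.findall(pattern, baseline, flags=re.M)) == len(re.findall(pattern, corrupted, flags=re.M))
print(same_fences)  # True: fences intact, only the code between them changed
```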
+     def _parse_json_output(self, output: str, fallback_text: str) -> Dict[str, Any]:
+         """Enhanced JSON parsing with multiple fallback strategies."""
+         import re
+
+         # Strategy 1: Direct JSON parsing
+         try:
+             return json.loads(output)
+         except json.JSONDecodeError:
+             pass
+
+         # Strategy 2: Extract JSON block between ```json and ```
+         json_pattern = r'```(?:json)?\s*(\{.*?\})\s*```'
+         match = re.search(json_pattern, output, re.DOTALL)
+         if match:
+             try:
+                 return json.loads(match.group(1))
+             except json.JSONDecodeError:
+                 pass
+
+         # Strategy 3: Find first complete JSON object
+         start = output.find("{")
+         if start != -1:
+             # Find matching closing brace
+             brace_count = 0
+             end = start
+             for i, char in enumerate(output[start:], start):
+                 if char == "{":
+                     brace_count += 1
+                 elif char == "}":
+                     brace_count -= 1
+                     if brace_count == 0:
+                         end = i
+                         break
+
+             if brace_count == 0: # Found complete JSON object
+                 try:
+                     json_str = output[start:end+1]
+                     return json.loads(json_str)
+                 except json.JSONDecodeError:
+                     pass
+
+         # Strategy 4: Try to fix common JSON issues
+         try:
+             # Remove markdown code fences
+             cleaned = re.sub(r'```(?:json)?\s*', '', output)
+             cleaned = re.sub(r'```\s*$', '', cleaned)
+             # Remove leading/trailing whitespace
+             cleaned = cleaned.strip()
+             # Try parsing again
+             return json.loads(cleaned)
+         except json.JSONDecodeError:
+             pass
+
+         # Strategy 5: Fallback to deterministic injection
+         print(f"Warning: Failed to parse LLM JSON output, using fallback. Output preview: {output[:200]}...")
+         return {"corrupted_markdown": fallback_text, "errors": []}
+
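Strategy 3 (scan for the first balanced `{...}` object) is the workhorse when the model wraps its JSON in chatter. The same loop, lifted into a standalone sketch with a made-up model reply:

```python
import json

# Made-up model reply that wraps the JSON object in extra prose.
output = 'Sure, here is the corrupted intro: {"corrupted_markdown": "# Demo", "errors": []} Hope that helps!'

start = output.find("{")
brace_count, end = 0, start
for i, char in enumerate(output[start:], start):
    if char == "{":
        brace_count += 1
    elif char == "}":
        brace_count -= 1
        if brace_count == 0:
            end = i
            break

print(json.loads(output[start:end + 1]))
# {'corrupted_markdown': '# Demo', 'errors': []}
```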
+     def _extract_preserve_keywords(self, text: str) -> List[str]:
+         # Extract capitalized terms, domain hyphenations, and hostnames in links
+         kws: Set[str] = set()
+         for m in re.finditer(r"\b[A-Z][A-Za-z0-9\-/]{2,}(?:\s[A-Z][A-Za-z0-9\-/]{2,})*\b", text):
+             term = m.group(0)
+             if len(term) <= 40:
+                 kws.add(term)
+         for m in re.finditer(r"\b[\w]+-[\w]+\b", text):
+             if any(ch.isalpha() for ch in m.group(0)):
+                 kws.add(m.group(0))
+         for m in re.finditer(r"https?://([^/\s)]+)", text):
+             kws.add(m.group(1))
+         # Keep a small set to avoid over-constraining
+         out = list(kws)[:20]
+         return out
+
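On a fabricated one-line README the three regexes above pick up the capitalized project name, hyphenated domain terms, and link hostnames; the sentence and URL below are invented for illustration:

```python
import re

text = "BioGuider supports single-cell QC; docs at https://example.org/bioguider."

kws = set()
for m in re.finditer(r"\b[A-Z][A-Za-z0-9\-/]{2,}(?:\s[A-Z][A-Za-z0-9\-/]{2,})*\b", text):
    if len(m.group(0)) <= 40:
        kws.add(m.group(0))
for m in re.finditer(r"\b[\w]+-[\w]+\b", text):
    if any(ch.isalpha() for ch in m.group(0)):
        kws.add(m.group(0))
for m in re.finditer(r"https?://([^/\s)]+)", text):
    kws.add(m.group(1))

print(sorted(kws))  # ['BioGuider', 'example.org', 'single-cell']
```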
+     def _validate_corrupted(self, baseline: str, corrupted: str, preserve_keywords: List[str]) -> bool:
+         # Similarity threshold - increased for better structure preservation
+         ratio = SequenceMatcher(None, baseline, corrupted).ratio()
+         if ratio < 0.75:
+             return False
+         # Preserve keywords
+         for k in preserve_keywords:
+             if k and k not in corrupted:
+                 return False
+         # No new top-level sections
+         base_h2 = set([ln.strip() for ln in baseline.splitlines() if ln.strip().startswith("## ")])
+         corr_h2 = set([ln.strip() for ln in corrupted.splitlines() if ln.strip().startswith("## ")])
+         if not corr_h2.issubset(base_h2.union({"## Overview", "## Hardware Requirements", "## License", "## Usage", "## Dependencies", "## System Requirements"})):
+             return False
+         # New token ratio
+         btoks = set(re.findall(r"[A-Za-z0-9_\-]+", baseline.lower()))
+         ctoks = set(re.findall(r"[A-Za-z0-9_\-]+", corrupted.lower()))
+         new_ratio = len(ctoks - btoks) / max(1, len(ctoks))
+         if new_ratio > 0.25:
+             return False
+         # CRITICAL: Preserve code block structure
+         # Count code fences (``` or ```{...}) - must match
+         base_fences = len(re.findall(r"^```", baseline, flags=re.M))
+         corr_fences = len(re.findall(r"^```", corrupted, flags=re.M))
+         if base_fences != corr_fences:
+             return False
+         # Check RMarkdown chunks specifically
+         base_rmd_chunks = len(re.findall(r"^```\{[^}]*\}", baseline, flags=re.M))
+         corr_rmd_chunks = len(re.findall(r"^```\{[^}]*\}", corrupted, flags=re.M))
+         if base_rmd_chunks != corr_rmd_chunks:
+             return False
+         return True
+
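The 0.75 `SequenceMatcher` gate is deliberately loose: a few local typos barely move the ratio, while a rewritten section falls well under it. A rough illustration on made-up strings (exact ratios will vary):

```python
from difflib import SequenceMatcher

baseline = "## Install\nUse pip install bioguider and run the quick example.\n"
light_edit = baseline.replace("install", "instal").replace("example", "exmple")
rewrite = "## Setup\nCompletely different wording that shares little with the original text.\n"

print(SequenceMatcher(None, baseline, light_edit).ratio())  # close to 1.0 -> accepted
print(SequenceMatcher(None, baseline, rewrite).ratio())     # well under 0.75 -> rejected
```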
+     def _deterministic_inject(self, baseline: str) -> Tuple[str, Dict[str, Any]]:
+         errors: List[Dict[str, Any]] = []
+         text = baseline
+         # typo
+         if "successfully" in text:
+             text = text.replace("successfully", "succesfully", 1)
+             errors.append({"id": "e_typo_1", "category": "typo", "original_snippet": "successfully", "mutated_snippet": "succesfully", "rationale": "common misspelling"})
+         elif "installation" in text:
+             text = text.replace("installation", "instalation", 1)
+             errors.append({"id": "e_typo_1", "category": "typo", "original_snippet": "installation", "mutated_snippet": "instalation", "rationale": "common misspelling"})
+         # link
+         m = re.search(r"\]\(https?://[^)]+\)", text)
+         if m:
+             broken = m.group(0).replace("https://", "https//")
+             text = text.replace(m.group(0), broken, 1)
+             errors.append({"id": "e_link_1", "category": "link", "original_snippet": m.group(0), "mutated_snippet": broken, "rationale": "missing colon in scheme"})
+         # duplicate a small section (next header and paragraph)
+         lines = text.splitlines()
+         dup_idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("## ")), None)
+         if dup_idx is not None:
+             block = lines[dup_idx: min(len(lines), dup_idx+5)]
+             text = "\n".join(lines + ["", *block])
+             errors.append({"id": "e_dup_1", "category": "duplicate", "original_snippet": "\n".join(block), "mutated_snippet": "\n".join(block), "rationale": "duplicated section"})
+         # markdown structure: break a header
+         if "\n# " in text:
+             text = text.replace("\n# ", "\n#", 1)
+             errors.append({"id": "e_md_1", "category": "markdown_structure", "original_snippet": "\n# ", "mutated_snippet": "\n#", "rationale": "missing space in header"})
+         return text, {"errors": errors}
+
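The deterministic fallback can be exercised directly, which is handy when reading the code. This sketch calls the private helper on a toy README and assumes only that the package is importable; the `llm` argument is never touched by this path:

```python
# Illustrative only: exercising the deterministic fallback on a toy README.
from bioguider.generation.llm_injector import LLMErrorInjector

injector = LLMErrorInjector(None)  # the fallback path never uses the llm
toy = (
    "Intro text.\n# Demo\n\n## Install\n"
    "Run the installation script, then see [docs](https://example.org).\n"
)
text, manifest = injector._deterministic_inject(toy)
print([e["category"] for e in manifest["errors"]])
# ['typo', 'link', 'duplicate', 'markdown_structure']
```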
+     def _supplement_errors(self, baseline: str, corrupted: str, data: Dict[str, Any], min_per_category: int, project_terms: list[str] | None = None) -> Tuple[str, Dict[str, Any]]:
+         errors: List[Dict[str, Any]] = data.get("errors", []) or []
+         cat_counts: Dict[str, int] = {}
+         for e in errors:
+             cat = e.get("category", "")
+             cat_counts[cat] = cat_counts.get(cat, 0) + 1
+
+         # Track what's already been corrupted to avoid re-corruption
+         corrupted_snippets: Set[str] = set()
+         for e in errors:
+             corrupted_snippets.add(e.get("original_snippet", ""))
+             corrupted_snippets.add(e.get("mutated_snippet", ""))
+
+         def need(cat: str) -> int:
+             return max(0, min_per_category - cat_counts.get(cat, 0))
+
+         def add_error(cat: str, orig: str, mut: str, rationale: str) -> bool:
+             """Add error and update tracking. Returns True if added."""
+             if orig in corrupted_snippets or mut in corrupted_snippets:
+                 return False # Already corrupted
+             errors.append({
+                 "id": f"e_{cat}_sup_{len(errors)}",
+                 "category": cat,
+                 "original_snippet": orig,
+                 "mutated_snippet": mut,
+                 "rationale": rationale
+             })
+             cat_counts[cat] = cat_counts.get(cat, 0) + 1
+             corrupted_snippets.add(orig)
+             corrupted_snippets.add(mut)
+             return True
+
+         # Typo mutation functions for variety
+         def mutate_truncate(word: str) -> str:
+             """Remove last character."""
+             return word[:-1] if len(word) > 3 else word + "x"
+
+         def mutate_swap(word: str) -> str:
+             """Swap two adjacent characters."""
+             if len(word) < 4:
+                 return word + "e"
+             pos = len(word) // 2
+             return word[:pos] + word[pos+1] + word[pos] + word[pos+2:]
+
+         def mutate_delete(word: str) -> str:
+             """Delete a middle character."""
+             if len(word) < 5:
+                 return word[:-1]
+             pos = len(word) // 2
+             return word[:pos] + word[pos+1:]
+
+         def mutate_double(word: str) -> str:
+             """Double a character."""
+             if len(word) < 3:
+                 return word + word[-1]
+             pos = len(word) // 2
+             return word[:pos] + word[pos] + word[pos:]
+
+         def mutate_case(word: str) -> str:
+             """Change case of first letter."""
+             if word[0].isupper():
+                 return word[0].lower() + word[1:]
+             return word[0].upper() + word[1:]
+
+         typo_mutations = [mutate_truncate, mutate_swap, mutate_delete, mutate_double]
+         typo_mutation_idx = 0
+
+         # typo supplements - find words to corrupt with varied mutations
+         typo_attempts = 0
+         max_typo_attempts = min_per_category * 5 # More attempts for variety
+
+         # Priority words for typos
+         priority_words = [
+             "installation", "successfully", "analysis", "documentation", "maintained",
+             "example", "requirements", "license", "tutorials", "expression",
+             "differential", "features", "cluster", "cells", "data", "sample",
+             "marker", "gene", "function", "package", "method", "parameter",
+             "variable", "object", "default", "optional", "required", "specify",
+             "available", "different", "following", "particular", "similar",
+             "significant", "corresponding", "additional", "individual"
+         ]
+
+         while need("typo") > 0 and typo_attempts < max_typo_attempts:
+             typo_attempts += 1
+             found = False
+
+             # Try priority words first
+             for word in priority_words:
+                 pattern = r"\b" + re.escape(word) + r"\b"
+                 for m in re.finditer(pattern, corrupted, flags=re.I):
+                     orig = m.group(0)
+                     if orig in corrupted_snippets:
+                         continue
+
+                     # Try different mutations
+                     mutation_fn = typo_mutations[typo_mutation_idx % len(typo_mutations)]
+                     typo_mutation_idx += 1
+                     mut = mutation_fn(orig)
+
+                     if mut == orig or mut in corrupted_snippets:
+                         continue
+                     if orig not in baseline:
+                         continue
+
+                     corrupted = corrupted.replace(orig, mut, 1)
+                     rationale = f"{mutation_fn.__doc__.strip().lower()}"
+                     if add_error("typo", orig, mut, rationale):
+                         found = True
+                         break
+                 if found:
+                     break
+
+             if not found:
+                 # Try generic words with 5+ chars
+                 for m in re.finditer(r"\b[A-Za-z]{5,}\b", corrupted):
+                     orig = m.group(0)
+                     if orig in corrupted_snippets or orig not in baseline:
+                         continue
+                     if orig.lower() in ["false", "true", "null", "none"]:
+                         continue
+
+                     mutation_fn = typo_mutations[typo_mutation_idx % len(typo_mutations)]
+                     typo_mutation_idx += 1
+                     mut = mutation_fn(orig)
+
+                     if mut == orig or mut in corrupted_snippets:
+                         continue
+
+                     corrupted = corrupted.replace(orig, mut, 1)
+                     if add_error("typo", orig, mut, mutation_fn.__doc__.strip().lower()):
+                         found = True
+                         break
+
+             if not found:
+                 break
+
+         # link supplements - find unique links to corrupt
+         link_attempts = 0
+         while need("link") > 0 and link_attempts < min_per_category * 2:
+             link_attempts += 1
+             found = False
+             for m in re.finditer(r"\[[^\]]+\]\(https?://[^)]+\)", corrupted):
+                 orig = m.group(0)
+                 if orig in corrupted_snippets:
+                     continue
+                 mut = orig.replace("https://", "https//", 1)
+                 if mut == orig:
+                     mut = orig.replace("http://", "http//", 1)
+                 if mut == orig or mut in corrupted_snippets:
+                     continue
+                 corrupted = corrupted.replace(orig, mut, 1)
+                 if add_error("link", orig, mut, "scheme colon removed"):
+                     found = True
+                     break
+             if not found:
+                 break
+
+         # duplicate supplements (cap to min_per_category) - limited to avoid excessive duplication
+         dup_count = 0
+         max_dups = min(need("duplicate"), 5) # Cap duplicates at 5 max
+         while dup_count < max_dups:
+             lines = corrupted.splitlines()
+             idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("- ") or ln.strip().startswith("## ")), None)
+             if idx is None:
+                 break
+             frag = lines[idx]
+             if frag in corrupted_snippets:
+                 break # Already duplicated this line
+             lines = lines[:idx+1] + [frag] + lines[idx+1:]
+             corrupted = "\n".join(lines)
+             if add_error("duplicate", frag, frag, "line duplicated"):
+                 dup_count += 1
+             else:
+                 break
+
+         # bio_term supplements
+         bio_swaps = [(r"single cell", "single sell"), (r"genomics", "genomis"), (r"spatial", "spacial"),
+                      (r"transcriptome", "transcriptom"), (r"proteome", "proteom"), (r"methylation", "metylation")]
+         for pat, rep in bio_swaps:
+             if need("bio_term") <= 0:
+                 break
+             m = re.search(pat, corrupted, flags=re.I)
+             if m:
+                 orig = m.group(0)
+                 if orig in corrupted_snippets or orig not in baseline:
+                     continue
+                 mut = rep if orig.islower() else rep.title()
+                 if mut in corrupted_snippets:
+                     continue
+                 corrupted = corrupted.replace(orig, mut, 1)
+                 add_error("bio_term", orig, mut, "common domain typo")
+
+         # function supplements
+         # First try project terms if available
+         if project_terms:
+             # Check if any existing function error targets a project term
+             has_project_error = any(
+                 e.get("category") == "function" and
+                 any(term in e.get("original_snippet", "") for term in project_terms)
+                 for e in errors
+             )
+
+             # If no project error yet, force at least one if possible
+             force_project = not has_project_error
+
+             for term in project_terms:
+                 if need("function") <= 0 and not force_project:
+                     break
+
+                 # Look for term followed by optional parens
+                 m = re.search(r"\b" + re.escape(term) + r"(?:\(\)?)?", corrupted)
+                 if m:
+                     orig = m.group(0)
+                     # Skip if already corrupted
+                     if orig in corrupted_snippets or orig not in baseline:
+                         continue
+
+                     # Simple mutation: drop last char or append 'x'
+                     if len(term) > 3:
+                         mut_term = term[:-1]
+                     else:
+                         mut_term = term + "x"
+
+                     mut = orig.replace(term, mut_term)
+                     if mut in corrupted_snippets:
+                         continue
+
+                     corrupted = corrupted.replace(orig, mut, 1)
+                     if add_error("function", orig, mut, f"misspelled project function {term}"):
+                         if force_project:
+                             force_project = False
+
+         # Fallback to generic function detection - find unique functions
+         func_attempts = 0
+         while need("function") > 0 and func_attempts < min_per_category * 2:
+             func_attempts += 1
+             found = False
+             for m in re.finditer(r"\b([A-Za-z_][A-Za-z0-9_]*)\(", corrupted):
+                 fname = m.group(1)
+                 orig = fname + "("
+
+                 # Skip if already corrupted or not in baseline
+                 if orig in corrupted_snippets or orig not in baseline:
+                     continue
+                 # Skip project terms (handled above)
+                 if project_terms and fname in project_terms:
+                     continue
+
+                 if len(fname) > 3:
+                     mut_name = fname[:-1]
+                 else:
+                     mut_name = fname + "x"
+                 mutated = mut_name + "("
+
+                 if mutated in corrupted_snippets:
+                     continue
+
+                 corrupted = corrupted.replace(orig, mutated, 1)
+                 if add_error("function", orig, mutated, "misspelled API name"):
+                     found = True
+                     break
+             if not found:
+                 break
+
+         # markdown_structure supplements
+         # NOTE: We do NOT break code fences as this destroys document structure
+         # Only apply safe structural changes like header spacing
+         for _ in range(need("markdown_structure")):
+             # Try header space removal first (safe)
+             m = re.search(r"^(#{1,6}) +", corrupted, flags=re.M)
+             if m:
+                 orig = m.group(0)
+                 # Remove one space after # symbols
+                 mut = orig.rstrip()
+                 if mut != orig:
+                     corrupted = corrupted.replace(orig, mut, 1)
+                     errors.append({"id": f"e_md_sup_{len(errors)}", "category": "markdown_structure", "original_snippet": orig.strip(), "mutated_snippet": mut.strip(), "rationale": "removed header space"})
+                     continue
+             # Try list indentation issues (safe)
+             m = re.search(r"^( {2,4})[-*]", corrupted, flags=re.M)
+             if m:
+                 orig = m.group(0)
+                 # Change indentation slightly
+                 mut = " " + orig.lstrip() # reduce indent by 1
+                 corrupted = corrupted.replace(orig, mut, 1)
+                 errors.append({"id": f"e_md_sup_{len(errors)}", "category": "markdown_structure", "original_snippet": orig, "mutated_snippet": mut, "rationale": "inconsistent list indent"})
+                 continue
+             # No more safe structural changes available
+             break
+
+         # list_structure supplements
+         for _ in range(need("list_structure")):
+             m = re.search(r"^\-\s+\S", corrupted, flags=re.M)
+             if not m:
+                 break
+             orig = m.group(0)
+             mut = orig.replace("- ", "-", 1)
+             corrupted = corrupted.replace(orig, mut, 1)
+             errors.append({"id": f"e_list_sup_{len(errors)}", "category": "list_structure", "original_snippet": orig, "mutated_snippet": mut, "rationale": "bullet missing space"})
+
+         # section_title supplements
+         for _ in range(need("section_title")):
+             m = re.search(r"^##\s+(What is it\?|What can it do\?|Requirements|Install|Quick example|Learn more|License & Contact)$", corrupted, flags=re.M)
+             if not m:
+                 break
+             orig = m.group(0)
+             mut = orig.replace("What is it?", "What is It?").replace("Install", "Installation")
+             if mut == orig:
+                 break
+             corrupted = corrupted.replace(orig, mut, 1)
+             errors.append({"id": f"e_title_sup_{len(errors)}", "category": "section_title", "original_snippet": orig, "mutated_snippet": mut, "rationale": "subtle title change"})
+
+         # image_syntax supplements
+         for _ in range(need("image_syntax")):
+             m = re.search(r"!\[[^\]]*\]\([^\)]+\)", corrupted)
+             if not m:
+                 break
+             orig = m.group(0)
+             mut = orig.replace("](", "] (")
+             corrupted = corrupted.replace(orig, mut, 1)
+             errors.append({"id": f"e_img_sup_{len(errors)}", "category": "image_syntax", "original_snippet": orig, "mutated_snippet": mut, "rationale": "broken image spacing"})
+
+         # inline_code supplements
+         # NOTE: Only match single-backtick inline code, NOT code fences or RMarkdown chunks
+         for _ in range(need("inline_code")):
+             # Match inline code that:
+             # - Is NOT at the start of a line (to avoid code fences)
+             # - Contains word characters (actual code, not just punctuation)
+             # - Is surrounded by single backticks only
+             m = re.search(r"(?<!`)(?<!^)`([^`\n]{2,30})`(?!`)", corrupted)
+             if not m:
+                 break
+             orig = m.group(0)
+             inner = m.group(1)
+             # Skip if it looks like a code fence or RMarkdown chunk marker
+             if inner.startswith("{") or inner.startswith("```"):
+                 continue
+             mut = inner # Remove surrounding backticks
+             corrupted = corrupted.replace(orig, mut, 1)
+             errors.append({"id": f"e_code_sup_{len(errors)}", "category": "inline_code", "original_snippet": orig, "mutated_snippet": mut, "rationale": "removed inline code backticks"})
+
+         # ============================================================
+         # NEW ERROR CATEGORIES for more diverse injection
+         # ============================================================
+
+         # number supplements - change numeric values
+         number_attempts = 0
+         while need("number") > 0 and number_attempts < min_per_category * 2:
+             number_attempts += 1
+             found = False
+             # Match numbers not in code blocks (simple heuristic)
+             for m in re.finditer(r"(?<![`{])\b(\d+\.?\d*)\b(?![`}])", corrupted):
+                 orig = m.group(0)
+                 if orig in corrupted_snippets:
+                     continue
+                 # Change the number slightly
+                 try:
+                     num = float(orig)
+                     if num > 1:
+                         mut = str(int(num) + 1) if "." not in orig else str(num + 0.1)
+                     else:
+                         mut = str(num * 2) if num != 0 else "1"
+                 except:
+                     continue
+                 if mut == orig or mut in corrupted_snippets:
+                     continue
+                 corrupted = corrupted.replace(orig, mut, 1)
+                 if add_error("number", orig, mut, "changed numeric value"):
+                     found = True
+                     break
+             if not found:
+                 break
+
+         # boolean supplements - change TRUE/FALSE values
+         bool_patterns = [
+             (r"\bTRUE\b", "FALSE"),
+             (r"\bFALSE\b", "TRUE"),
+             (r"\btrue\b", "false"),
+             (r"\bfalse\b", "true"),
+             (r"\bTrue\b", "False"),
+             (r"\bFalse\b", "True"),
+         ]
+         for pat, replacement in bool_patterns:
+             if need("boolean") <= 0:
+                 break
+             m = re.search(pat, corrupted)
+             if m:
+                 orig = m.group(0)
+                 if orig in corrupted_snippets:
+                     continue
+                 mut = replacement
+                 corrupted = corrupted.replace(orig, mut, 1)
+                 add_error("boolean", orig, mut, "flipped boolean value")
+
+         # gene_case supplements - change gene symbol case (important in bioinformatics)
+         gene_patterns = [
+             (r"\b([A-Z]{2,}[0-9]*)\b", lambda m: m.group(1).lower()), # BRCA1 -> brca1
+             (r"\b([a-z]{2,}[0-9]*)\b", lambda m: m.group(1).upper()), # brca1 -> BRCA1
+         ]
+         gene_attempts = 0
+         while need("gene_case") > 0 and gene_attempts < min_per_category:
+             gene_attempts += 1
+             found = False
+             # Look for gene-like patterns (2+ letters, possibly followed by numbers)
+             for m in re.finditer(r"\b([A-Z]{2,6}[0-9]{0,2})\b", corrupted):
+                 orig = m.group(0)
+                 if orig in corrupted_snippets or len(orig) < 3:
+                     continue
+                 # Skip common words that aren't genes
+                 if orig.lower() in ["the", "and", "for", "not", "are", "was", "rmd", "csv", "pdf"]:
+                     continue
+                 mut = orig.lower()
+                 if mut == orig or mut in corrupted_snippets:
+                     continue
+                 corrupted = corrupted.replace(orig, mut, 1)
+                 if add_error("gene_case", orig, mut, "changed gene symbol case"):
+                     found = True
+                     break
+             if not found:
+                 break
+
+         # param_name supplements - corrupt parameter/argument names
+         param_attempts = 0
+         while need("param_name") > 0 and param_attempts < min_per_category * 2:
+             param_attempts += 1
+             found = False
+             # Match parameter assignments like "param = value" or "param=value"
+             for m in re.finditer(r"\b([a-z_][a-z0-9_.]*)\s*=\s*", corrupted, flags=re.I):
+                 param = m.group(1)
+                 orig = param
+                 if orig in corrupted_snippets or len(param) < 3:
+                     continue
+                 # Typo the parameter name
+                 if len(param) > 3:
+                     mut = param[:-1]
+                 else:
+                     mut = param + "x"
+                 if mut == orig or mut in corrupted_snippets:
+                     continue
+                 # Replace in context
+                 full_orig = m.group(0)
+                 full_mut = full_orig.replace(param, mut, 1)
+                 corrupted = corrupted.replace(full_orig, full_mut, 1)
+                 if add_error("param_name", orig, mut, "misspelled parameter name"):
+                     found = True
+                     break
+             if not found:
+                 break
+
+         # comment_typo supplements - typos in R comments (# lines)
+         comment_attempts = 0
+         while need("comment_typo") > 0 and comment_attempts < min_per_category:
+             comment_attempts += 1
+             found = False
+             # Find comment lines
+             for m in re.finditer(r"^#\s*(.+)$", corrupted, flags=re.M):
+                 comment_text = m.group(1)
+                 # Find a word in the comment to corrupt
+                 for word_m in re.finditer(r"\b([A-Za-z]{5,})\b", comment_text):
+                     word = word_m.group(1)
+                     if word in corrupted_snippets:
+                         continue
+                     mut = word[:-1] # Truncate
+                     if mut == word or mut in corrupted_snippets:
+                         continue
+                     corrupted = corrupted.replace(word, mut, 1)
+                     if add_error("comment_typo", word, mut, "typo in comment"):
+                         found = True
+                         break
+                 if found:
+                     break
+             if not found:
+                 break
+
+         # species_name supplements - corrupt species names
+         species_swaps = [
+             ("human", "humna"),
+             ("mouse", "mosue"),
+             ("Homo sapiens", "Homo sapien"),
+             ("Mus musculus", "Mus musclus"),
+         ]
+         for orig_sp, mut_sp in species_swaps:
+             if need("species_name") <= 0:
+                 break
+             if orig_sp in corrupted and orig_sp not in corrupted_snippets:
+                 corrupted = corrupted.replace(orig_sp, mut_sp, 1)
+                 add_error("species_name", orig_sp, mut_sp, "misspelled species name")
+
+         data["errors"] = errors
+         return corrupted, data
+
+
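The typo mutators in `_supplement_errors` are closures, but they are pure string helpers; re-declaring two of them shows the kind of edits they produce, and a `Counter` over a returned manifest is a quick way to confirm the per-category floor (the manifest below is hypothetical):

```python
from collections import Counter

# Standalone copies of two of the typo mutators (in the module they are closures).
def mutate_truncate(word: str) -> str:
    return word[:-1] if len(word) > 3 else word + "x"

def mutate_swap(word: str) -> str:
    if len(word) < 4:
        return word + "e"
    pos = len(word) // 2
    return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]

print(mutate_truncate("installation"), mutate_swap("analysis"))  # installatio analsyis

# Tally a (hypothetical) manifest per category to check the min_per_category floor.
manifest = {"errors": [{"category": "typo"}, {"category": "typo"}, {"category": "link"}]}
print(Counter(e["category"] for e in manifest["errors"]))  # Counter({'typo': 2, 'link': 1})
```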