bioguider 0.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,809 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import Tuple, Dict, Any, List, Set
|
|
5
|
+
import re
|
|
6
|
+
from difflib import SequenceMatcher
|
|
7
|
+
|
|
8
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
9
|
+
from bioguider.agents.common_conversation import CommonConversation
|
|
10
|
+
from bioguider.utils.utils import escape_braces
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
INJECTION_PROMPT = """
|
|
14
|
+
You are “BioGuider-Intro,” generating a deliberately flawed **INTRODUCTION** file
|
|
15
|
+
(“README-lite”) to test an auto-fixer. Start from the provided clean INTRO doc that follows the
|
|
16
|
+
BioGuider Intro structure (What is it? / What can it do? / Requirements / Install / Quick example /
|
|
17
|
+
Learn more / License & Contact). Produce a corrupted version with small, realistic defects.
|
|
18
|
+
|
|
19
|
+
GOAL
|
|
20
|
+
Introduce subtle but meaningful issues while keeping the document recognizably the same.
|
|
21
|
+
|
|
22
|
+
ERROR CATEGORIES (inject all)
|
|
23
|
+
- typo: spelling/grammar/punctuation mistakes
|
|
24
|
+
- link: malformed URL, wrong domain, or stray spaces in URL
|
|
25
|
+
- duplicate: duplicate a short line/section fragment
|
|
26
|
+
- bio_term: slightly wrong domain term (e.g., “single sell” for “single cell”); do not invent new science
|
|
27
|
+
- function: misspell a known function/API name **from the input README-lite only**
|
|
28
|
+
- markdown_structure: break a header level, list indentation, or code fence (one-off)
|
|
29
|
+
- list_structure: remove bullet space (e.g., “-item”), mix markers inconsistently
|
|
30
|
+
- section_title: subtly change a section title casing or wording
|
|
31
|
+
- image_syntax: break image markdown spacing (e.g., `![alt] (url)`)
|
|
32
|
+
- inline_code: remove backticks around inline code
|
|
33
|
+
- emphasis: break emphasis markers (e.g., missing closing `*`)
|
|
34
|
+
- table_alignment: misalign or omit a `|` in a markdown table
|
|
35
|
+
- code_lang_tag: use the wrong fenced code language (e.g., ```py for R)
|
|
36
|
+
|
|
37
|
+
BIOLOGY-SPECIFIC ERROR CATEGORIES (inject all; keep realistic & subtle)
|
|
38
|
+
- gene_symbol_case: change gene symbol casing or add suffix (e.g., “tp53”, “CD3e”), but **do not alter** protected keywords
|
|
39
|
+
- species_swap: imply human vs mouse mix-up (e.g., “mm10” vs “GRCh38”) in a short phrase
|
|
40
|
+
- ref_genome_mismatch: claim a reference genome that conflicts with the example file or text
|
|
41
|
+
- modality_confusion: conflate RNA-seq with ATAC or proteomics in a brief phrase
|
|
42
|
+
- normalization_error: misuse terms like CPM/TPM/CLR/log1p in a sentence
|
|
43
|
+
- umi_vs_read: confuse UMI counts vs read counts in a short line
|
|
44
|
+
- batch_effect: misstate “batch correction” vs “normalization” terminology
|
|
45
|
+
- qc_threshold: use a common but slightly wrong QC gate (e.g., mito% 0.5 instead of 5)
|
|
46
|
+
- file_format: mix up FASTQ/BAM/MTX/H5AD/RDS in a brief mention
|
|
47
|
+
- strandedness: claim “stranded” when workflow is unstranded (or vice versa)
|
|
48
|
+
- coordinates: confuse 0-based vs 1-based or chromosome naming style (chr1 vs 1)
|
|
49
|
+
- units_scale: use the wrong scale/unit (e.g., μm vs mm; 10e6 instead of 1e6)
|
|
50
|
+
- sample_type: conflate “primary tissue” with “cell line” in a single phrase
|
|
51
|
+
- contamination: misuse “ambient RNA” vs “doublets” terminology
|
|
52
|
+
|
|
53
|
+
CLI/CONFIG ERROR CATEGORIES (inject all)
|
|
54
|
+
- param_name: slightly misspell a CLI flag or config key (e.g., `--min-cell` → `--min-cells`)
|
|
55
|
+
- default_value: state a plausible but incorrect default value
|
|
56
|
+
- path_hint: introduce a subtle path typo (e.g., `data/filtrd`)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
CONSTRAINTS
|
|
60
|
+
- Keep edits minimal and local; **≥85% token overlap** with input.
|
|
61
|
+
- **CRITICAL: Preserve ALL code block structure exactly**:
|
|
62
|
+
* Do NOT remove, add, or modify code fence delimiters (``` or ```{r} or ```{python})
|
|
63
|
+
* The number of ``` lines MUST be identical in input and output
|
|
64
|
+
* For RMarkdown/Rmd files, preserve ALL chunk headers like ```{r, ...}
|
|
65
|
+
* Only introduce errors INSIDE code blocks (typos in code), never break the fences
|
|
66
|
+
- **Preserve section ORDER and TITLES** from the Intro spec (if applicable):
|
|
67
|
+
1) # <project_name>
|
|
68
|
+
_<tagline>_
|
|
69
|
+
2) What is it?
|
|
70
|
+
3) What can it do?
|
|
71
|
+
4) Requirements
|
|
72
|
+
5) Install
|
|
73
|
+
6) Quick example
|
|
74
|
+
7) Learn more
|
|
75
|
+
8) License & Contact
|
|
76
|
+
- Do **not** add or remove top-level sections. Subtle line-level corruption only.
|
|
77
|
+
- Maintain a **concise length** (≤ {max_words} words).
|
|
78
|
+
- Do **not** alter the protected keywords (exact casing/spelling): {keywords}
|
|
79
|
+
- Keep at least **{min_per_category} errors per category** listed above.
|
|
80
|
+
- Limit `duplicate` injections to at most **{min_per_category}**.
|
|
81
|
+
- If the input contains runnable code, keep it mostly intact but introduce **one** realistic break
|
|
82
|
+
(e.g., missing quote/paren or wrong function name) without adding new libraries.
|
|
83
|
+
- Keep at least one **valid** URL so the fixer can compare.
|
|
84
|
+
- Do not change the project identity, domain, or language.
|
|
85
|
+
- Do not include markers, explanations, or commentary in the corrupted markdown.
|
|
86
|
+
|
|
87
|
+
INPUT INTRO (clean README-lite)
|
|
88
|
+
<<INTRO>>
|
|
89
|
+
{readme}
|
|
90
|
+
<</INTRO>>
|
|
91
|
+
|
|
92
|
+
OUTPUT (JSON only):
|
|
93
|
+
{{
|
|
94
|
+
"corrupted_markdown": "<the entire corrupted INTRO as markdown>",
|
|
95
|
+
"errors": [
|
|
96
|
+
{{
|
|
97
|
+
"id": "e1",
|
|
98
|
+
"category": "typo|link|duplicate|bio_term|function|markdown_structure",
|
|
99
|
+
"rationale": "why this mutation is realistic",
|
|
100
|
+
"original_snippet": "<verbatim snippet from input>",
|
|
101
|
+
"mutated_snippet": "<verbatim mutated text>"
|
|
102
|
+
}}
|
|
103
|
+
// include one entry per individual mutation you applied
|
|
104
|
+
]
|
|
105
|
+
}}
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class LLMErrorInjector:
|
|
110
|
+
    def __init__(self, llm: BaseChatOpenAI):
        # Store the chat model once; inject() wraps it in a fresh
        # CommonConversation for each corruption request.
        self.llm = llm
|
|
112
|
+
|
|
113
|
+
def inject(self, readme_text: str, min_per_category: int = 3, preserve_keywords: list[str] | None = None, max_words: int = 450, project_terms: list[str] | None = None) -> Tuple[str, Dict[str, Any]]:
|
|
114
|
+
conv = CommonConversation(self.llm)
|
|
115
|
+
preserve_keywords = preserve_keywords or self._extract_preserve_keywords(readme_text)
|
|
116
|
+
|
|
117
|
+
# Add project terms to prompt if available
|
|
118
|
+
project_terms_section = ""
|
|
119
|
+
if project_terms:
|
|
120
|
+
terms_str = ", ".join(project_terms[:20]) # Limit to top 20 to avoid clutter
|
|
121
|
+
project_terms_section = f"\nPROJECT SPECIFIC TARGETS (Prioritize misspelling these):\n{terms_str}\n"
|
|
122
|
+
|
|
123
|
+
system_prompt = escape_braces(INJECTION_PROMPT).format(
|
|
124
|
+
readme=readme_text[:30000],
|
|
125
|
+
min_per_category=min_per_category,
|
|
126
|
+
keywords=", ".join(preserve_keywords) if preserve_keywords else "",
|
|
127
|
+
max_words=max_words,
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
if project_terms:
|
|
131
|
+
# Insert project terms section before ERROR CATEGORIES
|
|
132
|
+
system_prompt = system_prompt.replace("ERROR CATEGORIES (inject all)", f"{project_terms_section}\nERROR CATEGORIES (inject all)")
|
|
133
|
+
|
|
134
|
+
output, _ = conv.generate(system_prompt=system_prompt, instruction_prompt="Return the JSON now.")
|
|
135
|
+
|
|
136
|
+
# Enhanced JSON parsing with better error handling
|
|
137
|
+
data = self._parse_json_output(output, readme_text)
|
|
138
|
+
corrupted = data.get("corrupted_markdown", readme_text)
|
|
139
|
+
|
|
140
|
+
# CRITICAL: Check code block preservation before validation
|
|
141
|
+
if not self._check_code_blocks_preserved(readme_text, corrupted):
|
|
142
|
+
print("Warning: LLM output broke code blocks, using deterministic fallback")
|
|
143
|
+
corrupted, data = self._deterministic_inject(readme_text)
|
|
144
|
+
# Validate output stays within original context; fallback to deterministic if invalid
|
|
145
|
+
elif not self._validate_corrupted(readme_text, corrupted, preserve_keywords):
|
|
146
|
+
corrupted, data = self._deterministic_inject(readme_text)
|
|
147
|
+
|
|
148
|
+
# Supplement to satisfy minimum per-category counts using deterministic local edits
|
|
149
|
+
corrupted, data = self._supplement_errors(readme_text, corrupted, data, min_per_category, project_terms)
|
|
150
|
+
|
|
151
|
+
# Final safety check: ensure code blocks are still intact after supplements
|
|
152
|
+
if not self._check_code_blocks_preserved(readme_text, corrupted):
|
|
153
|
+
print("Warning: Supplements broke code blocks, reverting to baseline with minimal errors")
|
|
154
|
+
corrupted, data = self._deterministic_inject(readme_text)
|
|
155
|
+
|
|
156
|
+
manifest = {
|
|
157
|
+
"errors": data.get("errors", []),
|
|
158
|
+
}
|
|
159
|
+
return corrupted, manifest
|
|
160
|
+
|
|
161
|
+
def _check_code_blocks_preserved(self, baseline: str, corrupted: str) -> bool:
|
|
162
|
+
"""Check that code block structure is preserved exactly."""
|
|
163
|
+
# Count code fence lines (must match exactly)
|
|
164
|
+
base_fences = len(re.findall(r"^```", baseline, flags=re.M))
|
|
165
|
+
corr_fences = len(re.findall(r"^```", corrupted, flags=re.M))
|
|
166
|
+
if base_fences != corr_fences:
|
|
167
|
+
return False
|
|
168
|
+
|
|
169
|
+
# Check RMarkdown chunks specifically (```{r}, ```{python}, etc.)
|
|
170
|
+
base_rmd = re.findall(r"^```\{[^}]*\}", baseline, flags=re.M)
|
|
171
|
+
corr_rmd = re.findall(r"^```\{[^}]*\}", corrupted, flags=re.M)
|
|
172
|
+
if len(base_rmd) != len(corr_rmd):
|
|
173
|
+
return False
|
|
174
|
+
|
|
175
|
+
# Ensure closing ``` match opening count
|
|
176
|
+
base_close = len(re.findall(r"^```\s*$", baseline, flags=re.M))
|
|
177
|
+
corr_close = len(re.findall(r"^```\s*$", corrupted, flags=re.M))
|
|
178
|
+
if base_close != corr_close:
|
|
179
|
+
return False
|
|
180
|
+
|
|
181
|
+
return True
|
|
182
|
+
|
|
183
|
+
def _parse_json_output(self, output: str, fallback_text: str) -> Dict[str, Any]:
|
|
184
|
+
"""Enhanced JSON parsing with multiple fallback strategies."""
|
|
185
|
+
import re
|
|
186
|
+
|
|
187
|
+
# Strategy 1: Direct JSON parsing
|
|
188
|
+
try:
|
|
189
|
+
return json.loads(output)
|
|
190
|
+
except json.JSONDecodeError:
|
|
191
|
+
pass
|
|
192
|
+
|
|
193
|
+
# Strategy 2: Extract JSON block between ```json and ```
|
|
194
|
+
json_pattern = r'```(?:json)?\s*(\{.*?\})\s*```'
|
|
195
|
+
match = re.search(json_pattern, output, re.DOTALL)
|
|
196
|
+
if match:
|
|
197
|
+
try:
|
|
198
|
+
return json.loads(match.group(1))
|
|
199
|
+
except json.JSONDecodeError:
|
|
200
|
+
pass
|
|
201
|
+
|
|
202
|
+
# Strategy 3: Find first complete JSON object
|
|
203
|
+
start = output.find("{")
|
|
204
|
+
if start != -1:
|
|
205
|
+
# Find matching closing brace
|
|
206
|
+
brace_count = 0
|
|
207
|
+
end = start
|
|
208
|
+
for i, char in enumerate(output[start:], start):
|
|
209
|
+
if char == "{":
|
|
210
|
+
brace_count += 1
|
|
211
|
+
elif char == "}":
|
|
212
|
+
brace_count -= 1
|
|
213
|
+
if brace_count == 0:
|
|
214
|
+
end = i
|
|
215
|
+
break
|
|
216
|
+
|
|
217
|
+
if brace_count == 0: # Found complete JSON object
|
|
218
|
+
try:
|
|
219
|
+
json_str = output[start:end+1]
|
|
220
|
+
return json.loads(json_str)
|
|
221
|
+
except json.JSONDecodeError:
|
|
222
|
+
pass
|
|
223
|
+
|
|
224
|
+
# Strategy 4: Try to fix common JSON issues
|
|
225
|
+
try:
|
|
226
|
+
# Remove markdown code fences
|
|
227
|
+
cleaned = re.sub(r'```(?:json)?\s*', '', output)
|
|
228
|
+
cleaned = re.sub(r'```\s*$', '', cleaned)
|
|
229
|
+
# Remove leading/trailing whitespace
|
|
230
|
+
cleaned = cleaned.strip()
|
|
231
|
+
# Try parsing again
|
|
232
|
+
return json.loads(cleaned)
|
|
233
|
+
except json.JSONDecodeError:
|
|
234
|
+
pass
|
|
235
|
+
|
|
236
|
+
# Strategy 5: Fallback to deterministic injection
|
|
237
|
+
print(f"Warning: Failed to parse LLM JSON output, using fallback. Output preview: {output[:200]}...")
|
|
238
|
+
return {"corrupted_markdown": fallback_text, "errors": []}
|
|
239
|
+
|
|
240
|
+
def _extract_preserve_keywords(self, text: str) -> List[str]:
|
|
241
|
+
# Extract capitalized terms, domain hyphenations, and hostnames in links
|
|
242
|
+
kws: Set[str] = set()
|
|
243
|
+
for m in re.finditer(r"\b[A-Z][A-Za-z0-9\-/]{2,}(?:\s[A-Z][A-Za-z0-9\-/]{2,})*\b", text):
|
|
244
|
+
term = m.group(0)
|
|
245
|
+
if len(term) <= 40:
|
|
246
|
+
kws.add(term)
|
|
247
|
+
for m in re.finditer(r"\b[\w]+-[\w]+\b", text):
|
|
248
|
+
if any(ch.isalpha() for ch in m.group(0)):
|
|
249
|
+
kws.add(m.group(0))
|
|
250
|
+
for m in re.finditer(r"https?://([^/\s)]+)", text):
|
|
251
|
+
kws.add(m.group(1))
|
|
252
|
+
# Keep a small set to avoid over-constraining
|
|
253
|
+
out = list(kws)[:20]
|
|
254
|
+
return out
|
|
255
|
+
|
|
256
|
+
def _validate_corrupted(self, baseline: str, corrupted: str, preserve_keywords: List[str]) -> bool:
|
|
257
|
+
# Similarity threshold - increased for better structure preservation
|
|
258
|
+
ratio = SequenceMatcher(None, baseline, corrupted).ratio()
|
|
259
|
+
if ratio < 0.75:
|
|
260
|
+
return False
|
|
261
|
+
# Preserve keywords
|
|
262
|
+
for k in preserve_keywords:
|
|
263
|
+
if k and k not in corrupted:
|
|
264
|
+
return False
|
|
265
|
+
# No new top-level sections
|
|
266
|
+
base_h2 = set([ln.strip() for ln in baseline.splitlines() if ln.strip().startswith("## ")])
|
|
267
|
+
corr_h2 = set([ln.strip() for ln in corrupted.splitlines() if ln.strip().startswith("## ")])
|
|
268
|
+
if not corr_h2.issubset(base_h2.union({"## Overview", "## Hardware Requirements", "## License", "## Usage", "## Dependencies", "## System Requirements"})):
|
|
269
|
+
return False
|
|
270
|
+
# New token ratio
|
|
271
|
+
btoks = set(re.findall(r"[A-Za-z0-9_\-]+", baseline.lower()))
|
|
272
|
+
ctoks = set(re.findall(r"[A-Za-z0-9_\-]+", corrupted.lower()))
|
|
273
|
+
new_ratio = len(ctoks - btoks) / max(1, len(ctoks))
|
|
274
|
+
if new_ratio > 0.25:
|
|
275
|
+
return False
|
|
276
|
+
# CRITICAL: Preserve code block structure
|
|
277
|
+
# Count code fences (``` or ```{...}) - must match
|
|
278
|
+
base_fences = len(re.findall(r"^```", baseline, flags=re.M))
|
|
279
|
+
corr_fences = len(re.findall(r"^```", corrupted, flags=re.M))
|
|
280
|
+
if base_fences != corr_fences:
|
|
281
|
+
return False
|
|
282
|
+
# Check RMarkdown chunks specifically
|
|
283
|
+
base_rmd_chunks = len(re.findall(r"^```\{[^}]*\}", baseline, flags=re.M))
|
|
284
|
+
corr_rmd_chunks = len(re.findall(r"^```\{[^}]*\}", corrupted, flags=re.M))
|
|
285
|
+
if base_rmd_chunks != corr_rmd_chunks:
|
|
286
|
+
return False
|
|
287
|
+
return True
|
|
288
|
+
|
|
289
|
+
def _deterministic_inject(self, baseline: str) -> Tuple[str, Dict[str, Any]]:
|
|
290
|
+
errors: List[Dict[str, Any]] = []
|
|
291
|
+
text = baseline
|
|
292
|
+
# typo
|
|
293
|
+
if "successfully" in text:
|
|
294
|
+
text = text.replace("successfully", "succesfully", 1)
|
|
295
|
+
errors.append({"id": "e_typo_1", "category": "typo", "original_snippet": "successfully", "mutated_snippet": "succesfully", "rationale": "common misspelling"})
|
|
296
|
+
elif "installation" in text:
|
|
297
|
+
text = text.replace("installation", "instalation", 1)
|
|
298
|
+
errors.append({"id": "e_typo_1", "category": "typo", "original_snippet": "installation", "mutated_snippet": "instalation", "rationale": "common misspelling"})
|
|
299
|
+
# link
|
|
300
|
+
m = re.search(r"\]\(https?://[^)]+\)", text)
|
|
301
|
+
if m:
|
|
302
|
+
broken = m.group(0).replace("https://", "https//")
|
|
303
|
+
text = text.replace(m.group(0), broken, 1)
|
|
304
|
+
errors.append({"id": "e_link_1", "category": "link", "original_snippet": m.group(0), "mutated_snippet": broken, "rationale": "missing colon in scheme"})
|
|
305
|
+
# duplicate a small section (next header and paragraph)
|
|
306
|
+
lines = text.splitlines()
|
|
307
|
+
dup_idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("## ")), None)
|
|
308
|
+
if dup_idx is not None:
|
|
309
|
+
block = lines[dup_idx: min(len(lines), dup_idx+5)]
|
|
310
|
+
text = "\n".join(lines + ["", *block])
|
|
311
|
+
errors.append({"id": "e_dup_1", "category": "duplicate", "original_snippet": "\n".join(block), "mutated_snippet": "\n".join(block), "rationale": "duplicated section"})
|
|
312
|
+
# markdown structure: break a header
|
|
313
|
+
if "\n# " in text:
|
|
314
|
+
text = text.replace("\n# ", "\n#", 1)
|
|
315
|
+
errors.append({"id": "e_md_1", "category": "markdown_structure", "original_snippet": "\n# ", "mutated_snippet": "\n#", "rationale": "missing space in header"})
|
|
316
|
+
return text, {"errors": errors}
|
|
317
|
+
|
|
318
|
+
def _supplement_errors(self, baseline: str, corrupted: str, data: Dict[str, Any], min_per_category: int, project_terms: list[str] | None = None) -> Tuple[str, Dict[str, Any]]:
|
|
319
|
+
errors: List[Dict[str, Any]] = data.get("errors", []) or []
|
|
320
|
+
cat_counts: Dict[str, int] = {}
|
|
321
|
+
for e in errors:
|
|
322
|
+
cat = e.get("category", "")
|
|
323
|
+
cat_counts[cat] = cat_counts.get(cat, 0) + 1
|
|
324
|
+
|
|
325
|
+
# Track what's already been corrupted to avoid re-corruption
|
|
326
|
+
corrupted_snippets: Set[str] = set()
|
|
327
|
+
for e in errors:
|
|
328
|
+
corrupted_snippets.add(e.get("original_snippet", ""))
|
|
329
|
+
corrupted_snippets.add(e.get("mutated_snippet", ""))
|
|
330
|
+
|
|
331
|
+
def need(cat: str) -> int:
|
|
332
|
+
return max(0, min_per_category - cat_counts.get(cat, 0))
|
|
333
|
+
|
|
334
|
+
def add_error(cat: str, orig: str, mut: str, rationale: str) -> bool:
|
|
335
|
+
"""Add error and update tracking. Returns True if added."""
|
|
336
|
+
if orig in corrupted_snippets or mut in corrupted_snippets:
|
|
337
|
+
return False # Already corrupted
|
|
338
|
+
errors.append({
|
|
339
|
+
"id": f"e_{cat}_sup_{len(errors)}",
|
|
340
|
+
"category": cat,
|
|
341
|
+
"original_snippet": orig,
|
|
342
|
+
"mutated_snippet": mut,
|
|
343
|
+
"rationale": rationale
|
|
344
|
+
})
|
|
345
|
+
cat_counts[cat] = cat_counts.get(cat, 0) + 1
|
|
346
|
+
corrupted_snippets.add(orig)
|
|
347
|
+
corrupted_snippets.add(mut)
|
|
348
|
+
return True
|
|
349
|
+
|
|
350
|
+
# Typo mutation functions for variety
|
|
351
|
+
def mutate_truncate(word: str) -> str:
|
|
352
|
+
"""Remove last character."""
|
|
353
|
+
return word[:-1] if len(word) > 3 else word + "x"
|
|
354
|
+
|
|
355
|
+
def mutate_swap(word: str) -> str:
|
|
356
|
+
"""Swap two adjacent characters."""
|
|
357
|
+
if len(word) < 4:
|
|
358
|
+
return word + "e"
|
|
359
|
+
pos = len(word) // 2
|
|
360
|
+
return word[:pos] + word[pos+1] + word[pos] + word[pos+2:]
|
|
361
|
+
|
|
362
|
+
def mutate_delete(word: str) -> str:
|
|
363
|
+
"""Delete a middle character."""
|
|
364
|
+
if len(word) < 5:
|
|
365
|
+
return word[:-1]
|
|
366
|
+
pos = len(word) // 2
|
|
367
|
+
return word[:pos] + word[pos+1:]
|
|
368
|
+
|
|
369
|
+
def mutate_double(word: str) -> str:
|
|
370
|
+
"""Double a character."""
|
|
371
|
+
if len(word) < 3:
|
|
372
|
+
return word + word[-1]
|
|
373
|
+
pos = len(word) // 2
|
|
374
|
+
return word[:pos] + word[pos] + word[pos:]
|
|
375
|
+
|
|
376
|
+
def mutate_case(word: str) -> str:
|
|
377
|
+
"""Change case of first letter."""
|
|
378
|
+
if word[0].isupper():
|
|
379
|
+
return word[0].lower() + word[1:]
|
|
380
|
+
return word[0].upper() + word[1:]
|
|
381
|
+
|
|
382
|
+
typo_mutations = [mutate_truncate, mutate_swap, mutate_delete, mutate_double]
|
|
383
|
+
typo_mutation_idx = 0
|
|
384
|
+
|
|
385
|
+
# typo supplements - find words to corrupt with varied mutations
|
|
386
|
+
typo_attempts = 0
|
|
387
|
+
max_typo_attempts = min_per_category * 5 # More attempts for variety
|
|
388
|
+
|
|
389
|
+
# Priority words for typos
|
|
390
|
+
priority_words = [
|
|
391
|
+
"installation", "successfully", "analysis", "documentation", "maintained",
|
|
392
|
+
"example", "requirements", "license", "tutorials", "expression",
|
|
393
|
+
"differential", "features", "cluster", "cells", "data", "sample",
|
|
394
|
+
"marker", "gene", "function", "package", "method", "parameter",
|
|
395
|
+
"variable", "object", "default", "optional", "required", "specify",
|
|
396
|
+
"available", "different", "following", "particular", "similar",
|
|
397
|
+
"significant", "corresponding", "additional", "individual"
|
|
398
|
+
]
|
|
399
|
+
|
|
400
|
+
while need("typo") > 0 and typo_attempts < max_typo_attempts:
|
|
401
|
+
typo_attempts += 1
|
|
402
|
+
found = False
|
|
403
|
+
|
|
404
|
+
# Try priority words first
|
|
405
|
+
for word in priority_words:
|
|
406
|
+
pattern = r"\b" + re.escape(word) + r"\b"
|
|
407
|
+
for m in re.finditer(pattern, corrupted, flags=re.I):
|
|
408
|
+
orig = m.group(0)
|
|
409
|
+
if orig in corrupted_snippets:
|
|
410
|
+
continue
|
|
411
|
+
|
|
412
|
+
# Try different mutations
|
|
413
|
+
mutation_fn = typo_mutations[typo_mutation_idx % len(typo_mutations)]
|
|
414
|
+
typo_mutation_idx += 1
|
|
415
|
+
mut = mutation_fn(orig)
|
|
416
|
+
|
|
417
|
+
if mut == orig or mut in corrupted_snippets:
|
|
418
|
+
continue
|
|
419
|
+
if orig not in baseline:
|
|
420
|
+
continue
|
|
421
|
+
|
|
422
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
423
|
+
rationale = f"{mutation_fn.__doc__.strip().lower()}"
|
|
424
|
+
if add_error("typo", orig, mut, rationale):
|
|
425
|
+
found = True
|
|
426
|
+
break
|
|
427
|
+
if found:
|
|
428
|
+
break
|
|
429
|
+
|
|
430
|
+
if not found:
|
|
431
|
+
# Try generic words with 5+ chars
|
|
432
|
+
for m in re.finditer(r"\b[A-Za-z]{5,}\b", corrupted):
|
|
433
|
+
orig = m.group(0)
|
|
434
|
+
if orig in corrupted_snippets or orig not in baseline:
|
|
435
|
+
continue
|
|
436
|
+
if orig.lower() in ["false", "true", "null", "none"]:
|
|
437
|
+
continue
|
|
438
|
+
|
|
439
|
+
mutation_fn = typo_mutations[typo_mutation_idx % len(typo_mutations)]
|
|
440
|
+
typo_mutation_idx += 1
|
|
441
|
+
mut = mutation_fn(orig)
|
|
442
|
+
|
|
443
|
+
if mut == orig or mut in corrupted_snippets:
|
|
444
|
+
continue
|
|
445
|
+
|
|
446
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
447
|
+
if add_error("typo", orig, mut, mutation_fn.__doc__.strip().lower()):
|
|
448
|
+
found = True
|
|
449
|
+
break
|
|
450
|
+
|
|
451
|
+
if not found:
|
|
452
|
+
break
|
|
453
|
+
|
|
454
|
+
# link supplements - find unique links to corrupt
|
|
455
|
+
link_attempts = 0
|
|
456
|
+
while need("link") > 0 and link_attempts < min_per_category * 2:
|
|
457
|
+
link_attempts += 1
|
|
458
|
+
found = False
|
|
459
|
+
for m in re.finditer(r"\[[^\]]+\]\(https?://[^)]+\)", corrupted):
|
|
460
|
+
orig = m.group(0)
|
|
461
|
+
if orig in corrupted_snippets:
|
|
462
|
+
continue
|
|
463
|
+
mut = orig.replace("https://", "https//", 1)
|
|
464
|
+
if mut == orig:
|
|
465
|
+
mut = orig.replace("http://", "http//", 1)
|
|
466
|
+
if mut == orig or mut in corrupted_snippets:
|
|
467
|
+
continue
|
|
468
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
469
|
+
if add_error("link", orig, mut, "scheme colon removed"):
|
|
470
|
+
found = True
|
|
471
|
+
break
|
|
472
|
+
if not found:
|
|
473
|
+
break
|
|
474
|
+
|
|
475
|
+
# duplicate supplements (cap to min_per_category) - limited to avoid excessive duplication
|
|
476
|
+
dup_count = 0
|
|
477
|
+
max_dups = min(need("duplicate"), 5) # Cap duplicates at 5 max
|
|
478
|
+
while dup_count < max_dups:
|
|
479
|
+
lines = corrupted.splitlines()
|
|
480
|
+
idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("- ") or ln.strip().startswith("## ")), None)
|
|
481
|
+
if idx is None:
|
|
482
|
+
break
|
|
483
|
+
frag = lines[idx]
|
|
484
|
+
if frag in corrupted_snippets:
|
|
485
|
+
break # Already duplicated this line
|
|
486
|
+
lines = lines[:idx+1] + [frag] + lines[idx+1:]
|
|
487
|
+
corrupted = "\n".join(lines)
|
|
488
|
+
if add_error("duplicate", frag, frag, "line duplicated"):
|
|
489
|
+
dup_count += 1
|
|
490
|
+
else:
|
|
491
|
+
break
|
|
492
|
+
|
|
493
|
+
# bio_term supplements
|
|
494
|
+
bio_swaps = [(r"single cell", "single sell"), (r"genomics", "genomis"), (r"spatial", "spacial"),
|
|
495
|
+
(r"transcriptome", "transcriptom"), (r"proteome", "proteom"), (r"methylation", "metylation")]
|
|
496
|
+
for pat, rep in bio_swaps:
|
|
497
|
+
if need("bio_term") <= 0:
|
|
498
|
+
break
|
|
499
|
+
m = re.search(pat, corrupted, flags=re.I)
|
|
500
|
+
if m:
|
|
501
|
+
orig = m.group(0)
|
|
502
|
+
if orig in corrupted_snippets or orig not in baseline:
|
|
503
|
+
continue
|
|
504
|
+
mut = rep if orig.islower() else rep.title()
|
|
505
|
+
if mut in corrupted_snippets:
|
|
506
|
+
continue
|
|
507
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
508
|
+
add_error("bio_term", orig, mut, "common domain typo")
|
|
509
|
+
|
|
510
|
+
# function supplements
|
|
511
|
+
# First try project terms if available
|
|
512
|
+
if project_terms:
|
|
513
|
+
# Check if any existing function error targets a project term
|
|
514
|
+
has_project_error = any(
|
|
515
|
+
e.get("category") == "function" and
|
|
516
|
+
any(term in e.get("original_snippet", "") for term in project_terms)
|
|
517
|
+
for e in errors
|
|
518
|
+
)
|
|
519
|
+
|
|
520
|
+
# If no project error yet, force at least one if possible
|
|
521
|
+
force_project = not has_project_error
|
|
522
|
+
|
|
523
|
+
for term in project_terms:
|
|
524
|
+
if need("function") <= 0 and not force_project:
|
|
525
|
+
break
|
|
526
|
+
|
|
527
|
+
# Look for term followed by optional parens
|
|
528
|
+
m = re.search(r"\b" + re.escape(term) + r"(?:\(\)?)?", corrupted)
|
|
529
|
+
if m:
|
|
530
|
+
orig = m.group(0)
|
|
531
|
+
# Skip if already corrupted
|
|
532
|
+
if orig in corrupted_snippets or orig not in baseline:
|
|
533
|
+
continue
|
|
534
|
+
|
|
535
|
+
# Simple mutation: drop last char or append 'x'
|
|
536
|
+
if len(term) > 3:
|
|
537
|
+
mut_term = term[:-1]
|
|
538
|
+
else:
|
|
539
|
+
mut_term = term + "x"
|
|
540
|
+
|
|
541
|
+
mut = orig.replace(term, mut_term)
|
|
542
|
+
if mut in corrupted_snippets:
|
|
543
|
+
continue
|
|
544
|
+
|
|
545
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
546
|
+
if add_error("function", orig, mut, f"misspelled project function {term}"):
|
|
547
|
+
if force_project:
|
|
548
|
+
force_project = False
|
|
549
|
+
|
|
550
|
+
# Fallback to generic function detection - find unique functions
|
|
551
|
+
func_attempts = 0
|
|
552
|
+
while need("function") > 0 and func_attempts < min_per_category * 2:
|
|
553
|
+
func_attempts += 1
|
|
554
|
+
found = False
|
|
555
|
+
for m in re.finditer(r"\b([A-Za-z_][A-Za-z0-9_]*)\(", corrupted):
|
|
556
|
+
fname = m.group(1)
|
|
557
|
+
orig = fname + "("
|
|
558
|
+
|
|
559
|
+
# Skip if already corrupted or not in baseline
|
|
560
|
+
if orig in corrupted_snippets or orig not in baseline:
|
|
561
|
+
continue
|
|
562
|
+
# Skip project terms (handled above)
|
|
563
|
+
if project_terms and fname in project_terms:
|
|
564
|
+
continue
|
|
565
|
+
|
|
566
|
+
if len(fname) > 3:
|
|
567
|
+
mut_name = fname[:-1]
|
|
568
|
+
else:
|
|
569
|
+
mut_name = fname + "x"
|
|
570
|
+
mutated = mut_name + "("
|
|
571
|
+
|
|
572
|
+
if mutated in corrupted_snippets:
|
|
573
|
+
continue
|
|
574
|
+
|
|
575
|
+
corrupted = corrupted.replace(orig, mutated, 1)
|
|
576
|
+
if add_error("function", orig, mutated, "misspelled API name"):
|
|
577
|
+
found = True
|
|
578
|
+
break
|
|
579
|
+
if not found:
|
|
580
|
+
break
|
|
581
|
+
|
|
582
|
+
# markdown_structure supplements
|
|
583
|
+
# NOTE: We do NOT break code fences as this destroys document structure
|
|
584
|
+
# Only apply safe structural changes like header spacing
|
|
585
|
+
for _ in range(need("markdown_structure")):
|
|
586
|
+
# Try header space removal first (safe)
|
|
587
|
+
m = re.search(r"^(#{1,6}) +", corrupted, flags=re.M)
|
|
588
|
+
if m:
|
|
589
|
+
orig = m.group(0)
|
|
590
|
+
# Remove one space after # symbols
|
|
591
|
+
mut = orig.rstrip()
|
|
592
|
+
if mut != orig:
|
|
593
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
594
|
+
errors.append({"id": f"e_md_sup_{len(errors)}", "category": "markdown_structure", "original_snippet": orig.strip(), "mutated_snippet": mut.strip(), "rationale": "removed header space"})
|
|
595
|
+
continue
|
|
596
|
+
# Try list indentation issues (safe)
|
|
597
|
+
m = re.search(r"^( {2,4})[-*]", corrupted, flags=re.M)
|
|
598
|
+
if m:
|
|
599
|
+
orig = m.group(0)
|
|
600
|
+
# Change indentation slightly
|
|
601
|
+
mut = " " + orig.lstrip() # reduce indent by 1
|
|
602
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
603
|
+
errors.append({"id": f"e_md_sup_{len(errors)}", "category": "markdown_structure", "original_snippet": orig, "mutated_snippet": mut, "rationale": "inconsistent list indent"})
|
|
604
|
+
continue
|
|
605
|
+
# No more safe structural changes available
|
|
606
|
+
break
|
|
607
|
+
|
|
608
|
+
# list_structure supplements
|
|
609
|
+
for _ in range(need("list_structure")):
|
|
610
|
+
m = re.search(r"^\-\s+\S", corrupted, flags=re.M)
|
|
611
|
+
if not m:
|
|
612
|
+
break
|
|
613
|
+
orig = m.group(0)
|
|
614
|
+
mut = orig.replace("- ", "-", 1)
|
|
615
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
616
|
+
errors.append({"id": f"e_list_sup_{len(errors)}", "category": "list_structure", "original_snippet": orig, "mutated_snippet": mut, "rationale": "bullet missing space"})
|
|
617
|
+
|
|
618
|
+
# section_title supplements
|
|
619
|
+
for _ in range(need("section_title")):
|
|
620
|
+
m = re.search(r"^##\s+(What is it\?|What can it do\?|Requirements|Install|Quick example|Learn more|License & Contact)$", corrupted, flags=re.M)
|
|
621
|
+
if not m:
|
|
622
|
+
break
|
|
623
|
+
orig = m.group(0)
|
|
624
|
+
mut = orig.replace("What is it?", "What is It?").replace("Install", "Installation")
|
|
625
|
+
if mut == orig:
|
|
626
|
+
break
|
|
627
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
628
|
+
errors.append({"id": f"e_title_sup_{len(errors)}", "category": "section_title", "original_snippet": orig, "mutated_snippet": mut, "rationale": "subtle title change"})
|
|
629
|
+
|
|
630
|
+
# image_syntax supplements
|
|
631
|
+
for _ in range(need("image_syntax")):
|
|
632
|
+
m = re.search(r"!\[[^\]]*\]\([^\)]+\)", corrupted)
|
|
633
|
+
if not m:
|
|
634
|
+
break
|
|
635
|
+
orig = m.group(0)
|
|
636
|
+
mut = orig.replace("](", "] (")
|
|
637
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
638
|
+
errors.append({"id": f"e_img_sup_{len(errors)}", "category": "image_syntax", "original_snippet": orig, "mutated_snippet": mut, "rationale": "broken image spacing"})
|
|
639
|
+
|
|
640
|
+
# inline_code supplements
|
|
641
|
+
# NOTE: Only match single-backtick inline code, NOT code fences or RMarkdown chunks
|
|
642
|
+
for _ in range(need("inline_code")):
|
|
643
|
+
# Match inline code that:
|
|
644
|
+
# - Is NOT at the start of a line (to avoid code fences)
|
|
645
|
+
# - Contains word characters (actual code, not just punctuation)
|
|
646
|
+
# - Is surrounded by single backticks only
|
|
647
|
+
m = re.search(r"(?<!`)(?<!^)`([^`\n]{2,30})`(?!`)", corrupted)
|
|
648
|
+
if not m:
|
|
649
|
+
break
|
|
650
|
+
orig = m.group(0)
|
|
651
|
+
inner = m.group(1)
|
|
652
|
+
# Skip if it looks like a code fence or RMarkdown chunk marker
|
|
653
|
+
if inner.startswith("{") or inner.startswith("```"):
|
|
654
|
+
continue
|
|
655
|
+
mut = inner # Remove surrounding backticks
|
|
656
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
657
|
+
errors.append({"id": f"e_code_sup_{len(errors)}", "category": "inline_code", "original_snippet": orig, "mutated_snippet": mut, "rationale": "removed inline code backticks"})
|
|
658
|
+
|
|
659
|
+
# ============================================================
|
|
660
|
+
# NEW ERROR CATEGORIES for more diverse injection
|
|
661
|
+
# ============================================================
|
|
662
|
+
|
|
663
|
+
# number supplements - change numeric values
|
|
664
|
+
number_attempts = 0
|
|
665
|
+
while need("number") > 0 and number_attempts < min_per_category * 2:
|
|
666
|
+
number_attempts += 1
|
|
667
|
+
found = False
|
|
668
|
+
# Match numbers not in code blocks (simple heuristic)
|
|
669
|
+
for m in re.finditer(r"(?<![`{])\b(\d+\.?\d*)\b(?![`}])", corrupted):
|
|
670
|
+
orig = m.group(0)
|
|
671
|
+
if orig in corrupted_snippets:
|
|
672
|
+
continue
|
|
673
|
+
# Change the number slightly
|
|
674
|
+
try:
|
|
675
|
+
num = float(orig)
|
|
676
|
+
if num > 1:
|
|
677
|
+
mut = str(int(num) + 1) if "." not in orig else str(num + 0.1)
|
|
678
|
+
else:
|
|
679
|
+
mut = str(num * 2) if num != 0 else "1"
|
|
680
|
+
except:
|
|
681
|
+
continue
|
|
682
|
+
if mut == orig or mut in corrupted_snippets:
|
|
683
|
+
continue
|
|
684
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
685
|
+
if add_error("number", orig, mut, "changed numeric value"):
|
|
686
|
+
found = True
|
|
687
|
+
break
|
|
688
|
+
if not found:
|
|
689
|
+
break
|
|
690
|
+
|
|
691
|
+
# boolean supplements - change TRUE/FALSE values
|
|
692
|
+
bool_patterns = [
|
|
693
|
+
(r"\bTRUE\b", "FALSE"),
|
|
694
|
+
(r"\bFALSE\b", "TRUE"),
|
|
695
|
+
(r"\btrue\b", "false"),
|
|
696
|
+
(r"\bfalse\b", "true"),
|
|
697
|
+
(r"\bTrue\b", "False"),
|
|
698
|
+
(r"\bFalse\b", "True"),
|
|
699
|
+
]
|
|
700
|
+
for pat, replacement in bool_patterns:
|
|
701
|
+
if need("boolean") <= 0:
|
|
702
|
+
break
|
|
703
|
+
m = re.search(pat, corrupted)
|
|
704
|
+
if m:
|
|
705
|
+
orig = m.group(0)
|
|
706
|
+
if orig in corrupted_snippets:
|
|
707
|
+
continue
|
|
708
|
+
mut = replacement
|
|
709
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
710
|
+
add_error("boolean", orig, mut, "flipped boolean value")
|
|
711
|
+
|
|
712
|
+
# gene_case supplements - change gene symbol case (important in bioinformatics)
|
|
713
|
+
gene_patterns = [
|
|
714
|
+
(r"\b([A-Z]{2,}[0-9]*)\b", lambda m: m.group(1).lower()), # BRCA1 -> brca1
|
|
715
|
+
(r"\b([a-z]{2,}[0-9]*)\b", lambda m: m.group(1).upper()), # brca1 -> BRCA1
|
|
716
|
+
]
|
|
717
|
+
gene_attempts = 0
|
|
718
|
+
while need("gene_case") > 0 and gene_attempts < min_per_category:
|
|
719
|
+
gene_attempts += 1
|
|
720
|
+
found = False
|
|
721
|
+
# Look for gene-like patterns (2+ letters, possibly followed by numbers)
|
|
722
|
+
for m in re.finditer(r"\b([A-Z]{2,6}[0-9]{0,2})\b", corrupted):
|
|
723
|
+
orig = m.group(0)
|
|
724
|
+
if orig in corrupted_snippets or len(orig) < 3:
|
|
725
|
+
continue
|
|
726
|
+
# Skip common words that aren't genes
|
|
727
|
+
if orig.lower() in ["the", "and", "for", "not", "are", "was", "rmd", "csv", "pdf"]:
|
|
728
|
+
continue
|
|
729
|
+
mut = orig.lower()
|
|
730
|
+
if mut == orig or mut in corrupted_snippets:
|
|
731
|
+
continue
|
|
732
|
+
corrupted = corrupted.replace(orig, mut, 1)
|
|
733
|
+
if add_error("gene_case", orig, mut, "changed gene symbol case"):
|
|
734
|
+
found = True
|
|
735
|
+
break
|
|
736
|
+
if not found:
|
|
737
|
+
break
|
|
738
|
+
|
|
739
|
+
# param_name supplements - corrupt parameter/argument names
|
|
740
|
+
param_attempts = 0
|
|
741
|
+
while need("param_name") > 0 and param_attempts < min_per_category * 2:
|
|
742
|
+
param_attempts += 1
|
|
743
|
+
found = False
|
|
744
|
+
# Match parameter assignments like "param = value" or "param=value"
|
|
745
|
+
for m in re.finditer(r"\b([a-z_][a-z0-9_.]*)\s*=\s*", corrupted, flags=re.I):
|
|
746
|
+
param = m.group(1)
|
|
747
|
+
orig = param
|
|
748
|
+
if orig in corrupted_snippets or len(param) < 3:
|
|
749
|
+
continue
|
|
750
|
+
# Typo the parameter name
|
|
751
|
+
if len(param) > 3:
|
|
752
|
+
mut = param[:-1]
|
|
753
|
+
else:
|
|
754
|
+
mut = param + "x"
|
|
755
|
+
if mut == orig or mut in corrupted_snippets:
|
|
756
|
+
continue
|
|
757
|
+
# Replace in context
|
|
758
|
+
full_orig = m.group(0)
|
|
759
|
+
full_mut = full_orig.replace(param, mut, 1)
|
|
760
|
+
corrupted = corrupted.replace(full_orig, full_mut, 1)
|
|
761
|
+
if add_error("param_name", orig, mut, "misspelled parameter name"):
|
|
762
|
+
found = True
|
|
763
|
+
break
|
|
764
|
+
if not found:
|
|
765
|
+
break
|
|
766
|
+
|
|
767
|
+
# comment_typo supplements - typos in R comments (# lines)
|
|
768
|
+
comment_attempts = 0
|
|
769
|
+
while need("comment_typo") > 0 and comment_attempts < min_per_category:
|
|
770
|
+
comment_attempts += 1
|
|
771
|
+
found = False
|
|
772
|
+
# Find comment lines
|
|
773
|
+
for m in re.finditer(r"^#\s*(.+)$", corrupted, flags=re.M):
|
|
774
|
+
comment_text = m.group(1)
|
|
775
|
+
# Find a word in the comment to corrupt
|
|
776
|
+
for word_m in re.finditer(r"\b([A-Za-z]{5,})\b", comment_text):
|
|
777
|
+
word = word_m.group(1)
|
|
778
|
+
if word in corrupted_snippets:
|
|
779
|
+
continue
|
|
780
|
+
mut = word[:-1] # Truncate
|
|
781
|
+
if mut == word or mut in corrupted_snippets:
|
|
782
|
+
continue
|
|
783
|
+
corrupted = corrupted.replace(word, mut, 1)
|
|
784
|
+
if add_error("comment_typo", word, mut, "typo in comment"):
|
|
785
|
+
found = True
|
|
786
|
+
break
|
|
787
|
+
if found:
|
|
788
|
+
break
|
|
789
|
+
if not found:
|
|
790
|
+
break
|
|
791
|
+
|
|
792
|
+
# species_name supplements - corrupt species names
|
|
793
|
+
species_swaps = [
|
|
794
|
+
("human", "humna"),
|
|
795
|
+
("mouse", "mosue"),
|
|
796
|
+
("Homo sapiens", "Homo sapien"),
|
|
797
|
+
("Mus musculus", "Mus musclus"),
|
|
798
|
+
]
|
|
799
|
+
for orig_sp, mut_sp in species_swaps:
|
|
800
|
+
if need("species_name") <= 0:
|
|
801
|
+
break
|
|
802
|
+
if orig_sp in corrupted and orig_sp not in corrupted_snippets:
|
|
803
|
+
corrupted = corrupted.replace(orig_sp, mut_sp, 1)
|
|
804
|
+
add_error("species_name", orig_sp, mut_sp, "misspelled species name")
|
|
805
|
+
|
|
806
|
+
data["errors"] = errors
|
|
807
|
+
return corrupted, data
|
|
808
|
+
|
|
809
|
+
|