learn_bash_from_session_data 1.0.9 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scripts/main.py CHANGED
@@ -22,6 +22,7 @@ if sys.version_info < (3, 8):
22
22
  # Constants
23
23
  DEFAULT_OUTPUT_BASE = "./bash-learner-output"
24
24
  MAX_UNIQUE_COMMANDS = 500
25
+ VERSION = "1.0.10"
25
26
 
26
27
 
27
28
  def generate_timestamped_output_dir(base_dir: str = DEFAULT_OUTPUT_BASE) -> Path:
@@ -51,7 +52,8 @@ def get_sessions_base_path() -> Path:
51
52
  is_wsl = False
52
53
  try:
53
54
  with open("/proc/version", "r") as f:
54
- is_wsl = "microsoft" in f.read().lower() or "wsl" in f.read().lower()
55
+ proc_version = f.read().lower()
56
+ is_wsl = "microsoft" in proc_version or "wsl" in proc_version
55
57
  except (FileNotFoundError, PermissionError):
56
58
  pass
57
59
 
@@ -374,7 +376,7 @@ def run_extraction_pipeline(
374
376
  # Step 5: Re-parse expanded commands to get proper base_command for each
375
377
  parsed_expanded = parse_commands(expanded_commands)
376
378
 
377
- # Step 6: Count frequencies BEFORE deduplication
379
+ # Step 6: Count frequencies BEFORE deduplication (for accurate usage stats)
378
380
  cmd_frequency = Counter()
379
381
  base_cmd_frequency = Counter()
380
382
 
@@ -386,7 +388,7 @@ def run_extraction_pipeline(
386
388
  if base_cmd:
387
389
  base_cmd_frequency[base_cmd] += 1
388
390
 
389
- # Step 7: Deduplicate and add frequency data
391
+ # Step 7: Deduplicate and attach frequency data
390
392
  unique_commands = deduplicate_commands(parsed_expanded)
391
393
 
392
394
  # Add frequency to each unique command
@@ -403,7 +405,7 @@ def run_extraction_pipeline(
403
405
  else:
404
406
  print(f"\n{len(unique_commands)} unique commands")
405
407
 
406
- # Step 6: Analyze commands
408
+ # Step 8: Analyze commands
407
409
  print("\nAnalyzing commands...")
408
410
  analysis = analyze_commands(unique_commands)
409
411
 
@@ -415,13 +417,13 @@ def run_extraction_pipeline(
415
417
  analysis['operators_used'] = dict(operator_frequency)
416
418
  print(f" -> Generated analysis with {len(analysis.get('categories', {}))} categories")
417
419
 
418
- # Step 6: Generate quizzes
420
+ # Step 9: Generate quizzes
419
421
  print("\nGenerating quizzes...")
420
422
  quizzes = generate_quizzes(unique_commands, analysis)
421
423
  quiz_count = sum(len(q) for q in quizzes.values()) if isinstance(quizzes, dict) else len(quizzes)
422
424
  print(f" -> Generated {quiz_count} quiz questions")
423
425
 
424
- # Step 7: Generate HTML
426
+ # Step 10: Generate HTML
425
427
  print("\nGenerating HTML output...")
426
428
  html_files = generate_html(unique_commands, analysis, quizzes, output_dir)
427
429
  print(f" -> Created {len(html_files)} HTML files")
@@ -431,7 +433,7 @@ def run_extraction_pipeline(
431
433
  "metadata": {
432
434
  "generated_at": datetime.now().isoformat(),
433
435
  "run_id": output_dir.name,
434
- "version": "1.0.5",
436
+ "version": VERSION,
435
437
  },
436
438
  "input": {
437
439
  "sessions_processed": len(sessions),
@@ -477,8 +479,8 @@ def extract_sub_commands(cmd_str: str) -> List[str]:
477
479
  """
478
480
  Extract individual sub-commands from a compound command.
479
481
 
480
- Splits commands by ||, &&, |, and ; while preserving each sub-command
481
- as a learnable unit.
482
+ Splits commands by ||, &&, |, and ; while respecting quoting
483
+ and skipping inline code commands (python -c, node -e, bash -c).
482
484
 
483
485
  Args:
484
486
  cmd_str: The compound command string
@@ -488,35 +490,77 @@ def extract_sub_commands(cmd_str: str) -> List[str]:
488
490
  """
489
491
  import re
490
492
 
491
- # First, clean up redirections but keep them with their command
492
- # We want "pip show pkg 2>/dev/null" to stay together
493
+ if not cmd_str or not cmd_str.strip():
494
+ return []
493
495
 
494
- # Split by compound operators: ||, &&, |, ;
495
- # Use regex to split while handling edge cases
496
- # Note: | needs special handling to not match ||
496
+ # Don't split commands that contain inline code - the ; and | inside
497
+ # quoted code would produce garbage fragments
498
+ inline_patterns = [' -c "', " -c '", ' -c $', ' -e "', " -e '", ' -e $',
499
+ ' -c\n', ' -c\r']
500
+ first_token = cmd_str.split()[0] if cmd_str.split() else ''
501
+ if first_token in ('python', 'python3', 'node', 'bash', 'sh', 'ruby', 'perl'):
502
+ for pat in inline_patterns:
503
+ if pat in cmd_str:
504
+ return [cmd_str.strip()]
497
505
 
506
+ # Quote-aware splitting: track quote depth to avoid splitting inside quotes
498
507
  sub_commands = []
508
+ current = []
509
+ in_single = False
510
+ in_double = False
511
+ i = 0
512
+ chars = cmd_str
513
+
514
+ while i < len(chars):
515
+ c = chars[i]
516
+
517
+ # Track quoting state
518
+ if c == "'" and not in_double:
519
+ in_single = not in_single
520
+ current.append(c)
521
+ i += 1
522
+ elif c == '"' and not in_single:
523
+ in_double = not in_double
524
+ current.append(c)
525
+ i += 1
526
+ elif not in_single and not in_double:
527
+ # Check for compound operators outside quotes
528
+ remaining = chars[i:]
529
+ if remaining.startswith('&&'):
530
+ cmd = ''.join(current).strip()
531
+ if cmd:
532
+ sub_commands.append(cmd)
533
+ current = []
534
+ i += 2
535
+ elif remaining.startswith('||'):
536
+ cmd = ''.join(current).strip()
537
+ if cmd:
538
+ sub_commands.append(cmd)
539
+ current = []
540
+ i += 2
541
+ elif c == ';':
542
+ cmd = ''.join(current).strip()
543
+ if cmd:
544
+ sub_commands.append(cmd)
545
+ current = []
546
+ i += 1
547
+ elif c == '|' and not remaining.startswith('||'):
548
+ cmd = ''.join(current).strip()
549
+ if cmd:
550
+ sub_commands.append(cmd)
551
+ current = []
552
+ i += 1
553
+ else:
554
+ current.append(c)
555
+ i += 1
556
+ else:
557
+ current.append(c)
558
+ i += 1
499
559
 
500
- # Split by || first (highest precedence for our purposes)
501
- or_parts = re.split(r'\s*\|\|\s*', cmd_str)
502
-
503
- for or_part in or_parts:
504
- # Split each part by &&
505
- and_parts = re.split(r'\s*&&\s*', or_part)
506
-
507
- for and_part in and_parts:
508
- # Split each part by ; (sequential)
509
- seq_parts = re.split(r'\s*;\s*', and_part)
510
-
511
- for seq_part in seq_parts:
512
- # Split by single pipe |
513
- # Use negative lookbehind/lookahead to avoid ||
514
- pipe_parts = re.split(r'(?<!\|)\|(?!\|)', seq_part)
515
-
516
- for pipe_part in pipe_parts:
517
- cleaned = pipe_part.strip()
518
- if cleaned:
519
- sub_commands.append(cleaned)
560
+ # Add final segment
561
+ cmd = ''.join(current).strip()
562
+ if cmd:
563
+ sub_commands.append(cmd)
520
564
 
521
565
  return sub_commands
522
566
 
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Merge enrichment data into knowledge_base.py COMMAND_DB.
4
+
5
+ Reads enrichment data from enrichment_*.py files and merges them into
6
+ the existing COMMAND_DB entries in knowledge_base.py. Adds missing fields
7
+ (use_cases, gotchas, man_url, related, difficulty) and supplements
8
+ existing flag definitions with extra_flags.
9
+
10
+ Usage:
11
+ python scripts/merge_enrichment.py [--dry-run]
12
+ """
13
+
14
+ import sys
15
+ import re
16
+ import importlib
17
+ import importlib.util
18
+ from pathlib import Path
19
+ from typing import Dict, Any
20
+
21
+
22
def load_enrichment_module(filepath: Path) -> Dict[str, Any]:
    """Load ENRICHMENT_DATA from a Python file.

    Args:
        filepath: Path of the Python source file to execute.

    Returns:
        The module-level ``ENRICHMENT_DATA`` object, or an empty dict when
        the file does not define that name.

    Raises:
        ImportError: If no import spec/loader can be built for ``filepath``
            (for example, the file does not have a Python source suffix).
    """
    spec = importlib.util.spec_from_file_location("enrichment", filepath)
    # spec_from_file_location returns None when no loader handles the file;
    # fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load enrichment module from {filepath}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, 'ENRICHMENT_DATA', {})
28
+
29
+
30
def collect_all_enrichments(scripts_dir: Path) -> Dict[str, Any]:
    """Collect enrichment data from all enrichment_*.py files.

    Files are processed in sorted name order. When the same command appears
    in more than one file, a later file may only fill in keys that are
    missing or falsy in the data collected so far; it never overwrites a
    non-empty value.

    Args:
        scripts_dir: Directory searched (non-recursively) for
            ``enrichment_*.py`` files.

    Returns:
        Mapping of command name to its merged enrichment fields.
    """
    merged = {}
    for enrichment_file in sorted(scripts_dir.glob("enrichment_*.py")):
        print(f" Loading: {enrichment_file.name}")
        data = load_enrichment_module(enrichment_file)
        print(f" -> {len(data)} commands")
        for cmd_name, cmd_data in data.items():
            if cmd_name not in merged:
                # Store a copy so supplementing this entry later does not
                # mutate the dict owned by the loaded module.
                merged[cmd_name] = dict(cmd_data)
                continue
            target = merged[cmd_name]
            # Merge: later files can supplement but not overwrite
            for key, value in cmd_data.items():
                if not target.get(key):
                    target[key] = value
    return merged
46
+
47
+
48
def _py_string_literal(text: str) -> str:
    """Return *text* as a double-quoted string literal valid in Python source."""
    import json  # local import: only this helper needs it

    # JSON string escapes (\", \\, \n, \t, \uXXXX, ...) are all valid Python
    # escapes with the same meaning, so the result can be pasted into source.
    return json.dumps(text, ensure_ascii=False)


def _find_closing_brace(text: str, open_pos: int) -> int:
    """Return the index of the '}' matching the '{' at open_pos, or -1.

    Quoted strings (single or double quoted, with backslash escapes) are
    skipped so braces inside string contents are not counted.
    """
    depth = 0
    pos = open_pos
    length = len(text)
    while pos < length:
        char = text[pos]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return pos
        elif char in ('"', "'"):
            pos += 1
            while pos < length and text[pos] != char:
                if text[pos] == '\\':
                    pos += 1  # Skip escaped char
                pos += 1
        pos += 1
    return -1


def _insert_before_brace(text: str, close_pos: int, item_lines: list) -> str:
    """Insert item_lines just before the closing brace at close_pos.

    If the last existing item has no trailing comma, one is added so the
    resulting dict literal stays syntactically valid.
    """
    head = text[:close_pos]
    stripped = head.rstrip()
    if stripped and stripped[-1] not in ',{':
        head = stripped + ',' + head[len(stripped):]
    return head + '\n' + '\n'.join(item_lines) + '\n ' + text[close_pos:]


def _format_field_lines(additions: list) -> list:
    """Render (field, value) pairs as source lines for a dict entry.

    Only string values and lists of strings are rendered; other value types
    are skipped.
    """
    lines = []
    for field, value in additions:
        if isinstance(value, str):
            lines.append(f' "{field}": {_py_string_literal(value)},')
        elif isinstance(value, list) and all(isinstance(v, str) for v in value):
            literals = [_py_string_literal(v) for v in value]
            items = ', '.join(literals)
            if len(items) < 80:
                lines.append(f' "{field}": [{items}],')
            else:
                # Long lists go one item per line
                lines.append(f' "{field}": [')
                lines.extend(f' {literal},' for literal in literals)
                lines.append(' ],')
    return lines


def merge_into_knowledge_base(kb_path: Path, enrichments: Dict[str, Any], dry_run: bool = False) -> int:
    """
    Merge enrichment data into knowledge_base.py by modifying COMMAND_DB entries.

    Strategy: For each command in enrichments, find its entry in COMMAND_DB and
    insert the enrichment fields before the closing brace of that entry.
    ``extra_flags`` are appended to an existing "flags" dict and
    ``improved_description`` replaces an existing "description" string.

    Args:
        kb_path: Path of the knowledge base source file to edit.
        enrichments: Mapping of command name to enrichment fields.
        dry_run: When True, report what would change without writing the file.

    Returns number of commands enriched (a command counts when any of its
    fields, flags, or description were changed).
    """
    content = kb_path.read_text(encoding='utf-8')
    original_content = content
    enriched_count = 0
    fields_to_add = ['man_url', 'use_cases', 'gotchas', 'related', 'difficulty']

    for cmd_name, enrichment in enrichments.items():
        # Find the start of this command's dict entry: "cmd_name": {
        # re.escape handles special names like "."
        escaped_name = re.escape(cmd_name)
        entry_pattern = rf' "{escaped_name}": \{{'
        match = re.search(entry_pattern, content)
        if not match:
            print(f" WARNING: Command '{cmd_name}' not found in COMMAND_DB, skipping")
            continue

        entry_start = match.start()
        # match.end() - 1 is the entry's opening brace
        entry_end = _find_closing_brace(content, match.end() - 1)
        if entry_end == -1:
            print(f" WARNING: Could not find end of entry for '{cmd_name}', skipping")
            continue

        # Snapshot of the entry as it was before any edit for this command;
        # all "is this already present?" checks run against it.
        entry_content = content[entry_start:entry_end + 1]
        changed = False

        # Check which fields are missing
        additions = []
        for field in fields_to_add:
            if f'"{field}"' not in entry_content:
                value = enrichment.get(field)
                if value:
                    additions.append((field, value))

        # Handle extra_flags: merge into existing flags dict
        extra_flags = enrichment.get('extra_flags', {})
        if extra_flags and '"flags"' in entry_content:
            flag_lines = []
            for flag, desc in extra_flags.items():
                flag_literal = _py_string_literal(flag)
                if flag_literal not in entry_content:
                    flag_lines.append(f' {flag_literal}: {_py_string_literal(str(desc))},')
            flags_match = re.search(r'"flags":\s*\{', entry_content) if flag_lines else None
            if flags_match:
                flags_close = _find_closing_brace(entry_content, flags_match.end() - 1)
                if flags_close != -1:
                    # Offsets in entry_content are relative to entry_start
                    content = _insert_before_brace(content, entry_start + flags_close, flag_lines)
                    changed = True

        # Handle improved_description: replace existing description
        improved_desc = enrichment.get('improved_description')
        if improved_desc and '"description"' in entry_content:
            desc_pattern = (
                rf'( "{escaped_name}": \{{[^}}]*?"description":\s*)'
                r'"[^"\\]*(?:\\.[^"\\]*)*"'
            )
            new_literal = _py_string_literal(improved_desc)
            # A function replacement keeps backslashes in the new text from
            # being interpreted as template escapes / group references.
            updated = re.sub(desc_pattern, lambda m: m.group(1) + new_literal, content, count=1)
            if updated != content:
                content = updated
                changed = True

        insertion_lines = _format_field_lines(additions) if additions else []
        if insertion_lines:
            # Recalculate the entry end: content may have grown above
            match2 = re.search(entry_pattern, content)
            if match2:
                entry_end = _find_closing_brace(content, match2.end() - 1)
            if entry_end == -1:
                print(f" WARNING: Could not find end of entry for '{cmd_name}', skipping")
            else:
                # Insert before the closing brace
                content = _insert_before_brace(content, entry_end, insertion_lines)
                changed = True

        if changed:
            enriched_count += 1

    if content != original_content:
        if dry_run:
            print(f"\n DRY RUN: Would enrich {enriched_count} commands")
            # Show a diff summary
            added_lines = len(content.splitlines()) - len(original_content.splitlines())
            print(f" Would add ~{added_lines} lines")
        else:
            kb_path.write_text(content, encoding='utf-8')
            print(f"\n Enriched {enriched_count} commands in {kb_path.name}")

    return enriched_count
231
+
232
+
233
def main():
    """Command-line entry point: merge enrichment files into knowledge_base.py.

    Recognises ``--dry-run`` in ``sys.argv``. After a real (non-dry) run that
    changed the file, the result is compiled to check it is still valid
    Python; on a syntax error the original file content is restored.

    Returns:
        Process exit code: 0 on success, 1 when the knowledge base is
        missing, no enrichment data is found, or the merged file fails to
        compile.
    """
    dry_run = '--dry-run' in sys.argv

    scripts_dir = Path(__file__).parent
    kb_path = scripts_dir / 'knowledge_base.py'

    if not kb_path.exists():
        print(f"Error: {kb_path} not found")
        return 1

    print("Collecting enrichment data...")
    enrichments = collect_all_enrichments(scripts_dir)

    if not enrichments:
        print("No enrichment data found. Run the research agents first.")
        return 1

    print(f"\nTotal enrichments: {len(enrichments)} commands")
    print(f"\nMerging into {kb_path.name}{' (DRY RUN)' if dry_run else ''}...")

    # Keep the pre-merge text so a broken merge can actually be reverted.
    original_text = kb_path.read_text(encoding='utf-8')
    merge_into_knowledge_base(kb_path, enrichments, dry_run=dry_run)

    if not dry_run:
        merged_text = kb_path.read_text(encoding='utf-8')
        # Verify whenever the file changed, regardless of the reported count.
        if merged_text != original_text:
            # Verify the file is still valid Python
            print("\nVerifying syntax...")
            try:
                compile(merged_text, str(kb_path), 'exec')
            except SyntaxError as e:
                print(f" SYNTAX ERROR: {e}")
                print(" Reverting changes...")
                kb_path.write_text(original_text, encoding='utf-8')
                return 1
            print(" Syntax OK")

    print("\nDone.")
    return 0


if __name__ == '__main__':
    sys.exit(main())
@@ -784,29 +784,25 @@ def generate_what_does_quiz(
784
784
  QuizQuestion instance
785
785
  """
786
786
  cmd_string = command.get("command", "")
787
- description = command.get("description", "")
788
787
  complexity = command.get("complexity", 2)
789
788
 
790
789
  parsed = _parse_command(cmd_string)
791
790
  base_cmd = parsed["base"]
792
791
 
793
- # Build the correct description using educational bash-focused generator
794
- correct_desc = description
795
- if not correct_desc:
796
- # Use the educational bash description generator
797
- correct_desc = _generate_bash_description(cmd_string)
798
- # Capitalize first letter for consistent formatting
799
- if correct_desc:
800
- correct_desc = correct_desc[0].upper() + correct_desc[1:]
801
-
802
- # Add flag details if available
803
- flag_descs = []
804
- for flag in parsed["flags"]:
805
- fd = _get_flag_description(base_cmd, flag)
806
- if fd:
807
- flag_descs.append(f"{flag} ({fd.lower()})")
808
- if flag_descs:
809
- correct_desc += " using " + ", ".join(flag_descs)
792
+ # Always use the educational bash description generator (not session descriptions)
793
+ correct_desc = _generate_bash_description(cmd_string)
794
+ # Capitalize first letter for consistent formatting
795
+ if correct_desc:
796
+ correct_desc = correct_desc[0].upper() + correct_desc[1:]
797
+
798
+ # Add flag details if available
799
+ flag_descs = []
800
+ for flag in parsed["flags"]:
801
+ fd = _get_flag_description(base_cmd, flag)
802
+ if fd:
803
+ flag_descs.append(f"{flag} ({fd.lower()})")
804
+ if flag_descs:
805
+ correct_desc += " using " + ", ".join(flag_descs)
810
806
 
811
807
  # Generate distractors
812
808
  distractor_descriptions = _generate_distractor_descriptions(correct_desc, 3)
@@ -965,8 +961,6 @@ def generate_build_command_quiz(
965
961
  QuizQuestion instance
966
962
  """
967
963
  cmd_string = command.get("command", "")
968
- description = command.get("description", "")
969
- intent = command.get("intent", description)
970
964
 
971
965
  parsed = _parse_command(cmd_string)
972
966
  base_cmd = parsed["base"]
@@ -1051,14 +1045,8 @@ def generate_build_command_quiz(
1051
1045
 
1052
1046
  question_id = _generate_id(f"build_{cmd_string}")
1053
1047
 
1054
- # Use educational bash description for task if no intent/description available
1055
- if intent:
1056
- task_description = intent
1057
- elif description:
1058
- task_description = description
1059
- else:
1060
- # Generate educational description from the command
1061
- task_description = _generate_bash_description(cmd_string)
1048
+ # Always generate description from the command itself (not session descriptions)
1049
+ task_description = _generate_bash_description(cmd_string)
1062
1050
 
1063
1051
  return QuizQuestion(
1064
1052
  id=question_id,
@@ -1113,14 +1101,19 @@ def generate_spot_difference_quiz(
1113
1101
 
1114
1102
  # Build the correct explanation of difference
1115
1103
  differences = []
1116
- if only_in_1:
1117
- for flag in only_in_1:
1118
- desc = _get_flag_description(base_cmd, flag)
1119
- differences.append(f"Command 1 has `{flag}` ({desc or 'unknown'})")
1120
- if only_in_2:
1121
- for flag in only_in_2:
1104
+ has_unknown = False
1105
+ for flag_set, label in [(only_in_1, "Command 1"), (only_in_2, "Command 2")]:
1106
+ for flag in flag_set:
1122
1107
  desc = _get_flag_description(base_cmd, flag)
1123
- differences.append(f"Command 2 has `{flag}` ({desc or 'unknown'})")
1108
+ # Handle numeric flags like -3 (shorthand for -n 3)
1109
+ if not desc and re.match(r'^-\d+$', flag):
1110
+ desc = f"Specify count ({flag[1:]})"
1111
+ if not desc:
1112
+ has_unknown = True
1113
+ differences.append(f"{label} has `{flag}` ({desc or 'specifies an option'})")
1114
+ # Skip questions where we can't explain the flags well
1115
+ if has_unknown:
1116
+ return None
1124
1117
  if parsed1["args"] != parsed2["args"]:
1125
1118
  differences.append(f"Different arguments: '{' '.join(parsed1['args'])}' vs '{' '.join(parsed2['args'])}'")
1126
1119
 
@@ -1236,14 +1229,33 @@ def generate_quiz_set(
1236
1229
  """
1237
1230
  questions: list[QuizQuestion] = []
1238
1231
 
1232
+ # Filter out non-bash entries (Python code fragments, junk tokens, single chars)
1233
+ junk_tokens = {'version', 'total', 'package', 'success', 'error', 'reading',
1234
+ 'editing', 'done', 'warning', 'info', 'note', 'output',
1235
+ 'task', 'goal', 'purpose', 'what', 'description'}
1236
+ clean_commands = []
1237
+ for cmd in analyzed_commands:
1238
+ base = cmd.get("base_command", "")
1239
+ if not base or len(base) < 2:
1240
+ continue
1241
+ if any(c in base for c in ('(', ')', '=', '{', '}')):
1242
+ continue
1243
+ if any(c in base for c in ('\\', '"', "'")) or '&' in base:
1244
+ continue
1245
+ if base[0].isupper() and base.isalpha() and base not in ('PATH', 'HOME'):
1246
+ continue
1247
+ if base.lower() in junk_tokens:
1248
+ continue
1249
+ clean_commands.append(cmd)
1250
+
1239
1251
  # Filter commands by complexity >= 2
1240
1252
  eligible_commands = [
1241
- cmd for cmd in analyzed_commands
1253
+ cmd for cmd in clean_commands
1242
1254
  if cmd.get("complexity", 0) >= 2
1243
1255
  ]
1244
1256
 
1245
1257
  if not eligible_commands:
1246
- eligible_commands = analyzed_commands
1258
+ eligible_commands = clean_commands if clean_commands else analyzed_commands
1247
1259
 
1248
1260
  # Weight toward high-frequency commands
1249
1261
  weighted_commands = []
@@ -1270,12 +1282,17 @@ def generate_quiz_set(
1270
1282
  QuizType.SPOT_DIFFERENCE: set(),
1271
1283
  }
1272
1284
 
1285
+ # Max command length for readable quiz questions
1286
+ MAX_QUIZ_CMD_LEN = 200
1287
+
1273
1288
  # Generate "What does this do?" questions
1274
1289
  random.shuffle(weighted_commands)
1275
1290
  for cmd in weighted_commands:
1276
1291
  if len([q for q in questions if q.quiz_type == QuizType.WHAT_DOES]) >= target_what_does:
1277
1292
  break
1278
1293
  cmd_id = cmd.get("command", "")
1294
+ if len(cmd_id) > MAX_QUIZ_CMD_LEN:
1295
+ continue
1279
1296
  if cmd_id not in used_per_type[QuizType.WHAT_DOES]:
1280
1297
  q = generate_what_does_quiz(cmd)
1281
1298
  questions.append(q)
@@ -1299,6 +1316,8 @@ def generate_quiz_set(
1299
1316
  if len([q for q in questions if q.quiz_type == QuizType.BUILD_COMMAND]) >= target_build:
1300
1317
  break
1301
1318
  cmd_id = cmd.get("command", "")
1319
+ if len(cmd_id) > MAX_QUIZ_CMD_LEN:
1320
+ continue
1302
1321
  if cmd_id not in used_per_type[QuizType.BUILD_COMMAND]:
1303
1322
  q = generate_build_command_quiz(cmd)
1304
1323
  questions.append(q)
@@ -1310,6 +1329,8 @@ def generate_quiz_set(
1310
1329
  if len([q for q in questions if q.quiz_type == QuizType.SPOT_DIFFERENCE]) >= target_spot_diff:
1311
1330
  break
1312
1331
  cmd_id = cmd.get("command", "")
1332
+ if len(cmd_id) > MAX_QUIZ_CMD_LEN:
1333
+ continue
1313
1334
  if cmd_id not in used_per_type[QuizType.SPOT_DIFFERENCE]:
1314
1335
  variant = _create_similar_command_variant(cmd)
1315
1336
  if variant:
@@ -1318,6 +1339,17 @@ def generate_quiz_set(
1318
1339
  questions.append(q)
1319
1340
  used_per_type[QuizType.SPOT_DIFFERENCE].add(cmd_id)
1320
1341
 
1342
+ # Deduplicate by question text (same question can come from different commands)
1343
+ seen_texts = set()
1344
+ deduped = []
1345
+ for q in questions:
1346
+ # Normalize: take first 80 chars of question text
1347
+ q_key = q.question_text[:80]
1348
+ if q_key not in seen_texts:
1349
+ deduped.append(q)
1350
+ seen_texts.add(q_key)
1351
+ questions = deduped
1352
+
1321
1353
  # Shuffle final questions
1322
1354
  random.shuffle(questions)
1323
1355