learn_bash_from_session_data 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scripts/main.py CHANGED
@@ -319,11 +319,66 @@ def run_extraction_pipeline(
319
319
  parsed_commands = parse_commands(raw_commands)
320
320
  print(f" -> Parsed {len(parsed_commands)} commands")
321
321
 
322
- # Step 4: Count frequencies BEFORE deduplication
322
+ # Step 4: Expand compound commands into individual sub-commands
323
+ # Also count operators for tracking
323
324
  from collections import Counter
325
+ import re
326
+
327
+ operator_frequency = Counter()
328
+ expanded_commands = []
329
+
330
+ # Operator patterns to detect
331
+ operator_patterns = {
332
+ '||': r'\|\|',
333
+ '&&': r'&&',
334
+ '|': r'(?<!\|)\|(?!\|)', # Single pipe, not ||
335
+ '2>&1': r'2>&1',
336
+ '2>/dev/null': r'2>/dev/null',
337
+ '>': r'(?<![2&])>(?!>|&)', # Single >, not >> or 2> or >&
338
+ '>>': r'>>',
339
+ '<': r'<(?!<)',
340
+ }
341
+
342
+ for cmd in parsed_commands:
343
+ cmd_str = cmd.get('command', '') or cmd.get('raw', '')
344
+ if not cmd_str:
345
+ continue
346
+
347
+ # Count operators in this command
348
+ for op_name, op_pattern in operator_patterns.items():
349
+ matches = re.findall(op_pattern, cmd_str)
350
+ if matches:
351
+ operator_frequency[op_name] += len(matches)
352
+
353
+ # Check if this is a compound command
354
+ is_compound = any(op in cmd_str for op in ['||', '&&', ' | ', ';'])
355
+
356
+ if is_compound:
357
+ # Extract individual sub-commands from compound statement
358
+ sub_commands = extract_sub_commands(cmd_str)
359
+ for sub_cmd in sub_commands:
360
+ if sub_cmd.strip():
361
+ expanded_commands.append({
362
+ 'command': sub_cmd.strip(),
363
+ 'raw': sub_cmd.strip(),
364
+ 'original_compound': cmd_str,
365
+ 'description': cmd.get('description', ''),
366
+ 'output': cmd.get('output', ''),
367
+ })
368
+ else:
369
+ # Simple command - add as-is
370
+ expanded_commands.append(cmd)
371
+
372
+ print(f" -> Expanded to {len(expanded_commands)} individual commands")
373
+
374
+ # Step 5: Re-parse expanded commands to get proper base_command for each
375
+ parsed_expanded = parse_commands(expanded_commands)
376
+
377
+ # Step 6: Count frequencies BEFORE deduplication
324
378
  cmd_frequency = Counter()
325
379
  base_cmd_frequency = Counter()
326
- for cmd in parsed_commands:
380
+
381
+ for cmd in parsed_expanded:
327
382
  cmd_str = cmd.get('command', '') or cmd.get('raw', '')
328
383
  base_cmd = cmd.get('base_command', '')
329
384
  if cmd_str:
@@ -331,8 +386,8 @@ def run_extraction_pipeline(
331
386
  if base_cmd:
332
387
  base_cmd_frequency[base_cmd] += 1
333
388
 
334
- # Step 5: Deduplicate and add frequency data
335
- unique_commands = deduplicate_commands(parsed_commands)
389
+ # Step 7: Deduplicate and add frequency data
390
+ unique_commands = deduplicate_commands(parsed_expanded)
336
391
 
337
392
  # Add frequency to each unique command
338
393
  for cmd in unique_commands:
@@ -357,6 +412,7 @@ def run_extraction_pipeline(
357
412
  analysis['base_command_frequency'] = dict(base_cmd_frequency)
358
413
  analysis['top_commands'] = cmd_frequency.most_common(20)
359
414
  analysis['top_base_commands'] = base_cmd_frequency.most_common(20)
415
+ analysis['operators_used'] = dict(operator_frequency)
360
416
  print(f" -> Generated analysis with {len(analysis.get('categories', {}))} categories")
361
417
 
362
418
  # Step 6: Generate quizzes
@@ -375,7 +431,7 @@ def run_extraction_pipeline(
375
431
  "metadata": {
376
432
  "generated_at": datetime.now().isoformat(),
377
433
  "run_id": output_dir.name,
378
- "version": "1.0.4",
434
+ "version": "1.0.5",
379
435
  },
380
436
  "input": {
381
437
  "sessions_processed": len(sessions),
@@ -399,6 +455,7 @@ def run_extraction_pipeline(
399
455
  {"command": cmd, "count": count}
400
456
  for cmd, count in list(base_cmd_frequency.most_common(10))
401
457
  ],
458
+ "operators_used": dict(operator_frequency),
402
459
  "complexity_distribution": dict(analysis.get('complexity_distribution', {})),
403
460
  },
404
461
  "output": {
@@ -416,6 +473,54 @@ def run_extraction_pipeline(
416
473
  return True, f"Successfully generated learning materials in {output_dir}"
417
474
 
418
475
 
476
def extract_sub_commands(cmd_str: str) -> List[str]:
    """
    Extract individual sub-commands from a compound command.

    Splits on the control operators ``||``, ``&&``, ``|`` and ``;`` so that
    each sub-command becomes its own learnable unit. Redirections such as
    ``2>/dev/null`` or ``2>&1`` are not split points and stay attached to
    the sub-command they belong to.

    Operators are only treated as split points when they appear unquoted:
    text inside single or double quotes, and any backslash-escaped
    character (e.g. the ``\\;`` that terminates ``find -exec``), is copied
    through untouched. An unterminated quote swallows the remainder of the
    string into the current sub-command.

    Limitation: command substitution (``$(...)``, backticks) and subshell
    grouping are not parsed, so operators inside them still split.

    Args:
        cmd_str: The compound command string

    Returns:
        List of individual sub-command strings, stripped of surrounding
        whitespace, in left-to-right order. Empty pieces are dropped; an
        empty input yields an empty list.
    """
    if not cmd_str:
        return []

    sub_commands = []
    current = []      # characters of the sub-command being accumulated
    quote = None      # the open quote character, or None when unquoted
    length = len(cmd_str)
    i = 0

    def flush():
        # Close the current sub-command, keeping it only if non-blank.
        piece = ''.join(current).strip()
        if piece:
            sub_commands.append(piece)
        current.clear()

    while i < length:
        ch = cmd_str[i]

        if quote == "'":
            # Single quotes: everything is literal until the closing quote.
            current.append(ch)
            if ch == "'":
                quote = None
            i += 1
        elif quote == '"':
            # Double quotes: a backslash can escape the closing quote.
            if ch == '\\' and i + 1 < length:
                current.append(cmd_str[i:i + 2])
                i += 2
            else:
                current.append(ch)
                if ch == '"':
                    quote = None
                i += 1
        elif ch == '\\' and i + 1 < length:
            # Escaped character outside quotes (e.g. "\;") is never an operator.
            current.append(cmd_str[i:i + 2])
            i += 2
        elif ch in ('"', "'"):
            quote = ch
            current.append(ch)
            i += 1
        elif cmd_str.startswith(('||', '&&'), i):
            # Check two-character operators before the single pipe.
            flush()
            i += 2
        elif ch in '|;':
            flush()
            i += 1
        else:
            current.append(ch)
            i += 1

    flush()
    return sub_commands
522
+
523
+
419
524
  def deduplicate_commands(commands: List[Dict]) -> List[Dict]:
420
525
  """
421
526
  Remove duplicate commands while preserving order.
@@ -18,6 +18,45 @@ import random
18
18
  import re
19
19
  import hashlib
20
20
 
21
+ try:
22
+ from scripts.knowledge_base import COMMAND_DB, get_command_info, get_flags_for_command
23
+ except ImportError:
24
+ try:
25
+ from knowledge_base import COMMAND_DB, get_command_info, get_flags_for_command
26
+ except ImportError:
27
+ COMMAND_DB = {}
28
+ def get_command_info(name): return None
29
+ def get_flags_for_command(command): return {}
30
+
31
+
32
def _get_flags_for_cmd(cmd: str) -> dict[str, str]:
    """Return the flag -> description mapping for ``cmd``, merged from both sources.

    Entries returned by ``get_flags_for_command`` are taken first. Entries from
    the local ``FLAG_DATABASE`` are added only for flags that are not already
    present, so on a conflict the ``get_flags_for_command`` description wins.
    """
    merged = {}

    # Primary source: whatever get_flags_for_command reports (may be empty).
    primary = get_flags_for_command(cmd)
    if primary:
        merged.update(primary)

    # Supplement: the local table only fills gaps, it never overrides.
    if cmd in FLAG_DATABASE:
        for flag_name, flag_desc in FLAG_DATABASE[cmd].items():
            merged.setdefault(flag_name, flag_desc)

    return merged
49
+
50
+
51
def _get_all_flagged_commands() -> set[str]:
    """Return every command name that has flag data in COMMAND_DB or FLAG_DATABASE."""
    # A COMMAND_DB entry counts only when its "flags" value is truthy.
    flagged = {name for name, entry in COMMAND_DB.items() if entry.get("flags")}
    # Every key of the local FLAG_DATABASE counts unconditionally.
    flagged.update(FLAG_DATABASE.keys())
    return flagged
59
+
21
60
 
22
61
  class QuizType(Enum):
23
62
  """Types of quiz questions."""
@@ -397,37 +436,98 @@ def _generate_bash_description(cmd_string: str) -> str:
397
436
  Generate an educational description focusing on bash concepts.
398
437
 
399
438
  Explains what each part of the command does from a bash perspective.
439
+ Handles: &&, ||, |, 2>&1, 2>/dev/null, and combinations.
400
440
  """
401
441
  if not cmd_string:
402
442
  return "Runs a command"
403
443
 
444
+ # Clean up redirections for description (note them but don't clutter)
445
+ has_stderr_to_stdout = '2>&1' in cmd_string
446
+ has_stderr_to_null = '2>/dev/null' in cmd_string
447
+ has_stdout_redirect = re.search(r'>\s*\S+', cmd_string) and '2>' not in cmd_string
448
+
449
+ # Remove redirections for parsing (we'll note them separately)
450
+ clean_cmd = re.sub(r'\s*2>&1\s*', ' ', cmd_string)
451
+ clean_cmd = re.sub(r'\s*2>/dev/null\s*', ' ', clean_cmd)
452
+ clean_cmd = re.sub(r'\s*>\s*\S+\s*', ' ', clean_cmd)
453
+ clean_cmd = ' '.join(clean_cmd.split()) # normalize whitespace
454
+
404
455
  parts = []
405
456
 
406
- # Check for command chaining
407
- if ' && ' in cmd_string:
408
- commands = cmd_string.split(' && ')
457
+ # Handle && (run if previous succeeds)
458
+ if ' && ' in clean_cmd:
459
+ commands = clean_cmd.split(' && ')
409
460
  for i, cmd in enumerate(commands):
410
- base = cmd.strip().split()[0] if cmd.strip() else ''
411
- if i == 0:
412
- parts.append(_describe_single_command(cmd.strip()))
461
+ cmd = cmd.strip()
462
+ if not cmd:
463
+ continue
464
+ # Handle nested || or | within && segments
465
+ if ' || ' in cmd:
466
+ parts.append(_describe_or_chain(cmd))
467
+ elif ' | ' in cmd:
468
+ parts.append(_describe_pipe_chain(cmd))
469
+ elif i == 0:
470
+ parts.append(_describe_single_command(cmd))
413
471
  else:
414
- parts.append(f"then {_describe_single_command(cmd.strip())}")
415
- return ', '.join(parts)
472
+ parts.append(f"then {_describe_single_command(cmd)}")
473
+
474
+ # Handle || (run if previous fails)
475
+ elif ' || ' in clean_cmd:
476
+ parts.append(_describe_or_chain(clean_cmd))
477
+
478
+ # Handle | (pipe)
479
+ elif ' | ' in clean_cmd:
480
+ parts.append(_describe_pipe_chain(clean_cmd))
481
+
482
+ else:
483
+ parts.append(_describe_single_command(clean_cmd))
484
+
485
+ result = ', '.join(parts)
486
+
487
+ # Add redirection notes
488
+ if has_stderr_to_null:
489
+ result += " (suppressing errors)"
490
+ elif has_stderr_to_stdout:
491
+ result += " (capturing all output)"
416
492
 
417
- if ' || ' in cmd_string:
418
- commands = cmd_string.split(' || ')
419
- parts.append(_describe_single_command(commands[0].strip()))
420
- parts.append(f"or if that fails, {_describe_single_command(commands[1].strip())}")
421
- return ', '.join(parts)
493
+ return result
422
494
 
423
- if ' | ' in cmd_string:
424
- commands = cmd_string.split(' | ')
425
- parts.append(_describe_single_command(commands[0].strip()))
426
- for cmd in commands[1:]:
427
- parts.append(f"pipes output to {_describe_single_command(cmd.strip())}")
428
- return ', '.join(parts)
429
495
 
430
- return _describe_single_command(cmd_string)
496
def _describe_or_chain(cmd_string: str) -> str:
    """Describe a command joined by ' || ' (each later part is a fallback).

    The string is split on ' || '. Blank segments are skipped. A segment that
    itself contains ' | ' is described as a pipe chain, anything else as a
    single command. Every segment except the one at split position 0 is
    prefixed with "or if that fails, ". The pieces are joined with ', '.
    """
    described = []
    for position, segment in enumerate(cmd_string.split(' || ')):
        segment = segment.strip()
        if not segment:
            continue
        # Pick the describer based on whether this segment contains a pipe.
        describe = _describe_pipe_chain if ' | ' in segment else _describe_single_command
        text = describe(segment)
        # The prefix depends on the original split index, not on how many
        # segments have been emitted so far.
        described.append(text if position == 0 else f"or if that fails, {text}")
    return ', '.join(described)
515
+
516
+
517
def _describe_pipe_chain(cmd_string: str) -> str:
    """Describe a command joined by ' | '.

    The string is split on ' | '. Blank stages are skipped. Each remaining
    stage is described as a single command, and every stage except the one at
    split position 0 is prefixed with "pipes to ". The pieces are joined
    with ', '.
    """
    described = []
    for position, stage in enumerate(cmd_string.split(' | ')):
        stage = stage.strip()
        if not stage:
            continue
        text = _describe_single_command(stage)
        # The prefix depends on the original split index of the stage.
        described.append(text if position == 0 else f"pipes to {text}")
    return ', '.join(described)
431
531
 
432
532
 
433
533
  def _describe_single_command(cmd: str) -> str:
@@ -438,6 +538,19 @@ def _describe_single_command(cmd: str) -> str:
438
538
  tokens = cmd.split()
439
539
  base_cmd = tokens[0] if tokens else ''
440
540
 
541
+ # Get args (skip flags) for knowledge_base fallback
542
+ args = [t for t in tokens[1:] if not t.startswith('-')]
543
+
544
+ # Check knowledge_base COMMAND_DB for rich description
545
+ if base_cmd and base_cmd in COMMAND_DB:
546
+ cmd_info = COMMAND_DB[base_cmd]
547
+ kb_desc = cmd_info.get('description', '')
548
+ if kb_desc:
549
+ # Use knowledge base description but make it contextual with args
550
+ if args:
551
+ return f"{kb_desc.lower()} ({' '.join(args[:2])})"
552
+ return kb_desc.lower()
553
+
441
554
  # Common command descriptions with bash focus
442
555
  descriptions = {
443
556
  'cd': lambda args: f"changes directory to {args[0] if args else 'specified path'}",
@@ -576,35 +689,35 @@ def _parse_command(cmd_string: str) -> dict:
576
689
 
577
690
 
578
691
def _get_flag_description(cmd: str, flag: str) -> Optional[str]:
    """Look up the description of ``flag`` for ``cmd`` in the merged flag data.

    An exact match is returned first. Otherwise, if ``flag`` looks like a
    bundle of short flags (starts with a single '-' and is longer than two
    characters, e.g. -la), each character is tried as its own short flag and
    the first one with a description is returned. Returns None when nothing
    matches.
    """
    known = _get_flags_for_cmd(cmd)
    if flag in known:
        return known[flag]

    # Only single-dash flags longer than "-x" are treated as bundles.
    if len(flag) <= 2 or not flag.startswith("-") or flag.startswith("--"):
        return None

    singles = (f"-{letter}" for letter in flag[1:])
    # First bundled letter that has a description wins.
    return next((known[single] for single in singles if single in known), None)
591
703
 
592
704
 
593
705
  def _generate_distractor_flags(cmd: str, correct_flag: str, count: int = 3) -> list[str]:
594
- """Generate plausible distractor flags."""
706
+ """Generate plausible distractor flags from merged knowledge sources."""
595
707
  distractors = []
596
708
 
597
- # Get other flags from the same command
598
- if cmd in FLAG_DATABASE:
599
- other_flags = [f for f in FLAG_DATABASE[cmd].keys() if f != correct_flag]
709
+ # Get other flags from the same command (merged sources)
710
+ cmd_flags = _get_flags_for_cmd(cmd)
711
+ if cmd_flags:
712
+ other_flags = [f for f in cmd_flags.keys() if f != correct_flag]
600
713
  random.shuffle(other_flags)
601
714
  distractors.extend(other_flags[:count])
602
715
 
603
716
  # If we need more, get common flags from other commands
604
717
  if len(distractors) < count:
605
- for other_cmd, flags in FLAG_DATABASE.items():
718
+ for other_cmd in _get_all_flagged_commands():
606
719
  if other_cmd != cmd:
607
- for flag in flags:
720
+ for flag in _get_flags_for_cmd(other_cmd):
608
721
  if flag not in distractors and flag != correct_flag:
609
722
  distractors.append(flag)
610
723
  if len(distractors) >= count:
@@ -619,10 +732,10 @@ def _generate_distractor_descriptions(correct_desc: str, count: int = 3) -> list
619
732
  """Generate plausible wrong descriptions."""
620
733
  distractors = []
621
734
 
622
- # Collect all descriptions from FLAG_DATABASE
735
+ # Collect all descriptions from merged sources
623
736
  all_descriptions = []
624
- for cmd_flags in FLAG_DATABASE.values():
625
- all_descriptions.extend(cmd_flags.values())
737
+ for cmd in _get_all_flagged_commands():
738
+ all_descriptions.extend(_get_flags_for_cmd(cmd).values())
626
739
 
627
740
  # Remove duplicates and the correct answer
628
741
  all_descriptions = list(set(all_descriptions))
@@ -736,16 +849,17 @@ def generate_which_flag_quiz(
736
849
  parsed = _parse_command(cmd_string)
737
850
  base_cmd = parsed["base"]
738
851
 
739
- if base_cmd not in FLAG_DATABASE or not parsed["flags"]:
852
+ cmd_flags = _get_flags_for_cmd(base_cmd)
853
+ if not cmd_flags or not parsed["flags"]:
740
854
  return None
741
855
 
742
856
  # Pick a flag to quiz on
743
- available_flags = [f for f in parsed["flags"] if f in FLAG_DATABASE.get(base_cmd, {})]
857
+ available_flags = [f for f in parsed["flags"] if f in cmd_flags]
744
858
  if not available_flags:
745
859
  return None
746
860
 
747
861
  target_flag = random.choice(available_flags)
748
- flag_desc = FLAG_DATABASE[base_cmd][target_flag]
862
+ flag_desc = cmd_flags[target_flag]
749
863
 
750
864
  # Generate distractor flags
751
865
  distractor_flags = _generate_distractor_flags(base_cmd, target_flag, 3)
@@ -770,13 +884,13 @@ def generate_which_flag_quiz(
770
884
  correct_id = opt_id
771
885
 
772
886
  # Get description for option explanation
773
- flag_explanation = FLAG_DATABASE.get(base_cmd, {}).get(flag, "Unknown flag")
887
+ flag_explanation = cmd_flags.get(flag, "Unknown flag")
774
888
 
775
889
  options.append(QuizOption(
776
890
  id=opt_id,
777
891
  text=flag,
778
892
  is_correct=is_correct,
779
- explanation=f"{flag}: {flag_explanation}" if flag in FLAG_DATABASE.get(base_cmd, {}) else f"{flag}: Not a standard flag for {base_cmd}"
893
+ explanation=f"{flag}: {flag_explanation}" if flag in cmd_flags else f"{flag}: Not a standard flag for {base_cmd}"
780
894
  ))
781
895
 
782
896
  question_id = _generate_id(f"which_flag_{base_cmd}_{target_flag}")
@@ -835,7 +949,7 @@ def generate_build_command_quiz(
835
949
  distractors.append(" ".join(missing_flag))
836
950
 
837
951
  # Distractor 3: Wrong flag
838
- if parsed["flags"] and base_cmd in FLAG_DATABASE:
952
+ if parsed["flags"] and _get_flags_for_cmd(base_cmd):
839
953
  wrong_flags = _generate_distractor_flags(base_cmd, parsed["flags"][0], 1)
840
954
  if wrong_flags:
841
955
  wrong_flag_cmd = [base_cmd] + [wrong_flags[0]] + parsed["flags"][1:] + parsed["args"]
@@ -1004,14 +1118,15 @@ def _create_similar_command_variant(command: dict) -> Optional[dict]:
1004
1118
  parsed = _parse_command(cmd_string)
1005
1119
  base_cmd = parsed["base"]
1006
1120
 
1007
- if base_cmd not in FLAG_DATABASE:
1121
+ variant_flags = _get_flags_for_cmd(base_cmd)
1122
+ if not variant_flags:
1008
1123
  return None
1009
1124
 
1010
1125
  # Strategy: add, remove, or change a flag
1011
1126
  strategies = []
1012
1127
 
1013
1128
  # Can add a flag
1014
- available_flags = [f for f in FLAG_DATABASE[base_cmd].keys() if f not in parsed["flags"]]
1129
+ available_flags = [f for f in variant_flags.keys() if f not in parsed["flags"]]
1015
1130
  if available_flags:
1016
1131
  strategies.append("add")
1017
1132
 
package/vectors.db ADDED
Binary file