npm - learn_bash_from_session_data - Versions diffs - 1.0.9 → 1.0.10 - Mend

learn_bash_from_session_data 1.0.9 → 1.0.10

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +8 -1
package/scripts/html_generator.py +10 -6
package/scripts/knowledge_base.py +22 -0
package/scripts/main.py +78 -34
package/scripts/quiz_generator.py +69 -37

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "learn_bash_from_session_data",
-  "version": "1.0.9",
+  "version": "1.0.10",
   "description": "Learn bash from your Claude Code sessions - extracts commands and generates interactive HTML lessons with 400+ commands, quizzes, and comprehensive coverage",
   "main": "bin/learn-bash.js",
   "bin": {
@@ -27,6 +27,13 @@
   "engines": {
     "node": ">=14.0.0"
   },
+  "files": [
+    "bin/",
+    "scripts/*.py",
+    "scripts/__init__.py",
+    "README.md",
+    "LICENSE"
+  ],
   "repository": {
     "type": "git",
     "url": "git+https://github.com/bjpl/learn_bash_from_session_data.git"

package/scripts/html_generator.py CHANGED Viewed

@@ -2191,12 +2191,16 @@ def generate_html_files(
         # Skip entries that look like code fragments (contain parens, equals, dots as methods)
         if any(c in base_cmd for c in ('(', ')', '=', '{', '}')) and not base_cmd.startswith('.'):
             continue
+        # Skip entries with backslashes, quotes, or HTML entities (JSONL text fragments)
+        if any(c in base_cmd for c in ('\\', '"', "'")) or '&' in base_cmd:
+            continue
         # Skip entries that are clearly not commands (capitalized status words, text fragments)
         if base_cmd[0].isupper() and base_cmd.isalpha() and base_cmd not in ('PATH', 'HOME'):
             continue
         # Skip common text fragments that get misidentified as commands
         junk_tokens = {'version', 'total', 'package', 'success', 'error', 'reading',
-                       'editing', 'done', 'warning', 'info', 'note', 'output'}
+                       'editing', 'done', 'warning', 'info', 'note', 'output',
+                       'task', 'goal', 'purpose', 'what', 'description'}
         if base_cmd.lower() in junk_tokens:
             continue
@@ -2279,7 +2283,6 @@ def generate_html_files(
                         '-g': 'Global scope',
                         '-p': 'Preserve attributes or port',
                         '-o': 'Output file',
-                        '-P': 'No dereference (physical path)',
                     }
                     flag_desc = common_flags.get(f, '')
                 formatted_flags.append({'flag': f, 'description': flag_desc})
@@ -2342,13 +2345,14 @@ def generate_html_files(
             if short_args:
                 contextual_desc = f"{base_cmd} {' '.join(short_args[:3])}"
-        # Priority: contextual > session > knowledge base
+        # Priority: contextual > knowledge base > generic fallback
+        # Session descriptions (from JSONL) describe Claude's task, NOT the command
         if contextual_desc:
             description = contextual_desc
-        elif session_desc:
-            description = session_desc
+        elif kb_desc:
+            description = kb_desc
         else:
-            description = kb_desc if kb_desc else f"Run {base_cmd} command"
+            description = f"Run {base_cmd} command"
         # Get subcommand info (for commands like git, docker, npm)
         subcommands = cmd_info.get('subcommands', {})

package/scripts/knowledge_base.py CHANGED Viewed

@@ -467,11 +467,16 @@ COMMAND_DB: Dict[str, Dict[str, Any]] = {
             "-n": "Show line numbers",
             "-c": "Count matching lines",
             "-w": "Match whole words only",
+            "-P": "Use Perl-compatible regular expressions (PCRE)",
             "-E": "Extended regex (same as egrep)",
+            "-F": "Fixed string matching (no regex interpretation)",
             "-o": "Show only matching part of line",
             "-A": "Show N lines after match",
             "-B": "Show N lines before match",
             "-C": "Show N lines of context (before and after)",
+            "-h": "Suppress filename prefix in output",
+            "-H": "Always show filename prefix",
+            "-q": "Quiet mode, only return exit status",
             "--include": "Search only files matching pattern",
             "--exclude": "Skip files matching pattern",
         },
@@ -801,6 +806,23 @@ COMMAND_DB: Dict[str, Dict[str, Any]] = {
         "flags": {
             "--version": "Print git version",
             "-C": "Run as if started in specified directory",
+            "--force": "Override safety checks and force the operation",
+            "-f": "Force operation (shorthand for --force)",
+            "--no-verify": "Skip pre-commit and commit-msg hooks",
+            "--amend": "Replace the tip of the current branch with a new commit",
+            "-m": "Specify commit message inline",
+            "-a": "Automatically stage modified and deleted files",
+            "-b": "Create and switch to a new branch",
+            "-d": "Delete a branch",
+            "-D": "Force delete a branch even if not fully merged",
+            "--all": "Apply to all branches or remotes",
+            "--oneline": "Compact one-line log format",
+            "--graph": "Show ASCII graph of branch and merge history",
+            "-u": "Set upstream tracking branch",
+            "--hard": "Reset working tree and index to match target",
+            "--soft": "Reset only HEAD, keep staged changes",
+            "-p": "Interactively choose hunks to stage",
+            "--stat": "Show diffstat summary of changes",
         },
         "subcommands": {
             "init": "Create empty repository",

package/scripts/main.py CHANGED Viewed

@@ -22,6 +22,7 @@ if sys.version_info < (3, 8):
 # Constants
 DEFAULT_OUTPUT_BASE = "./bash-learner-output"
 MAX_UNIQUE_COMMANDS = 500
+VERSION = "1.0.10"
 def generate_timestamped_output_dir(base_dir: str = DEFAULT_OUTPUT_BASE) -> Path:
@@ -51,7 +52,8 @@ def get_sessions_base_path() -> Path:
     is_wsl = False
     try:
         with open("/proc/version", "r") as f:
-            is_wsl = "microsoft" in f.read().lower() or "wsl" in f.read().lower()
+            proc_version = f.read().lower()
+            is_wsl = "microsoft" in proc_version or "wsl" in proc_version
     except (FileNotFoundError, PermissionError):
         pass
@@ -374,7 +376,7 @@ def run_extraction_pipeline(
     # Step 5: Re-parse expanded commands to get proper base_command for each
     parsed_expanded = parse_commands(expanded_commands)
-    # Step 6: Count frequencies BEFORE deduplication
+    # Step 6: Count frequencies BEFORE deduplication (for accurate usage stats)
     cmd_frequency = Counter()
     base_cmd_frequency = Counter()
@@ -386,7 +388,7 @@ def run_extraction_pipeline(
         if base_cmd:
             base_cmd_frequency[base_cmd] += 1
-    # Step 7: Deduplicate and add frequency data
+    # Step 7: Deduplicate and attach frequency data
     unique_commands = deduplicate_commands(parsed_expanded)
     # Add frequency to each unique command
@@ -403,7 +405,7 @@ def run_extraction_pipeline(
     else:
         print(f"\n{len(unique_commands)} unique commands")
-    # Step 6: Analyze commands
+    # Step 8: Analyze commands
     print("\nAnalyzing commands...")
     analysis = analyze_commands(unique_commands)
@@ -415,13 +417,13 @@ def run_extraction_pipeline(
     analysis['operators_used'] = dict(operator_frequency)
     print(f"  -> Generated analysis with {len(analysis.get('categories', {}))} categories")
-    # Step 6: Generate quizzes
+    # Step 9: Generate quizzes
     print("\nGenerating quizzes...")
     quizzes = generate_quizzes(unique_commands, analysis)
     quiz_count = sum(len(q) for q in quizzes.values()) if isinstance(quizzes, dict) else len(quizzes)
     print(f"  -> Generated {quiz_count} quiz questions")
-    # Step 7: Generate HTML
+    # Step 10: Generate HTML
     print("\nGenerating HTML output...")
     html_files = generate_html(unique_commands, analysis, quizzes, output_dir)
     print(f"  -> Created {len(html_files)} HTML files")
@@ -431,7 +433,7 @@ def run_extraction_pipeline(
         "metadata": {
             "generated_at": datetime.now().isoformat(),
             "run_id": output_dir.name,
-            "version": "1.0.5",
+            "version": VERSION,
         },
         "input": {
             "sessions_processed": len(sessions),
@@ -477,8 +479,8 @@ def extract_sub_commands(cmd_str: str) -> List[str]:
     """
     Extract individual sub-commands from a compound command.
-    Splits commands by ||, &&, |, and ; while preserving each sub-command
-    as a learnable unit.
+    Splits commands by ||, &&, |, and ; while respecting quoting
+    and skipping inline code commands (python -c, node -e, bash -c).
     Args:
         cmd_str: The compound command string
@@ -488,35 +490,77 @@ def extract_sub_commands(cmd_str: str) -> List[str]:
     """
     import re
-    # First, clean up redirections but keep them with their command
-    # We want "pip show pkg 2>/dev/null" to stay together
+    if not cmd_str or not cmd_str.strip():
+        return []
-    # Split by compound operators: ||, &&, |, ;
-    # Use regex to split while handling edge cases
-    # Note: | needs special handling to not match ||
+    # Don't split commands that contain inline code - the ; and | inside
+    # quoted code would produce garbage fragments
+    inline_patterns = [' -c "', " -c '", ' -c $', ' -e "', " -e '", ' -e $',
+                       ' -c\n', ' -c\r']
+    first_token = cmd_str.split()[0] if cmd_str.split() else ''
+    if first_token in ('python', 'python3', 'node', 'bash', 'sh', 'ruby', 'perl'):
+        for pat in inline_patterns:
+            if pat in cmd_str:
+                return [cmd_str.strip()]
+    # Quote-aware splitting: track quote depth to avoid splitting inside quotes
     sub_commands = []
+    current = []
+    in_single = False
+    in_double = False
+    i = 0
+    chars = cmd_str
+    while i < len(chars):
+        c = chars[i]
+        # Track quoting state
+        if c == "'" and not in_double:
+            in_single = not in_single
+            current.append(c)
+            i += 1
+        elif c == '"' and not in_single:
+            in_double = not in_double
+            current.append(c)
+            i += 1
+        elif not in_single and not in_double:
+            # Check for compound operators outside quotes
+            remaining = chars[i:]
+            if remaining.startswith('&&'):
+                cmd = ''.join(current).strip()
+                if cmd:
+                    sub_commands.append(cmd)
+                current = []
+                i += 2
+            elif remaining.startswith('||'):
+                cmd = ''.join(current).strip()
+                if cmd:
+                    sub_commands.append(cmd)
+                current = []
+                i += 2
+            elif c == ';':
+                cmd = ''.join(current).strip()
+                if cmd:
+                    sub_commands.append(cmd)
+                current = []
+                i += 1
+            elif c == '|' and not remaining.startswith('||'):
+                cmd = ''.join(current).strip()
+                if cmd:
+                    sub_commands.append(cmd)
+                current = []
+                i += 1
+            else:
+                current.append(c)
+                i += 1
+        else:
+            current.append(c)
+            i += 1
-    # Split by || first (highest precedence for our purposes)
-    or_parts = re.split(r'\s*\|\|\s*', cmd_str)
-    for or_part in or_parts:
-        # Split each part by &&
-        and_parts = re.split(r'\s*&&\s*', or_part)
-        for and_part in and_parts:
-            # Split each part by ; (sequential)
-            seq_parts = re.split(r'\s*;\s*', and_part)
-            for seq_part in seq_parts:
-                # Split by single pipe |
-                # Use negative lookbehind/lookahead to avoid ||
-                pipe_parts = re.split(r'(?<!\|)\|(?!\|)', seq_part)
-                for pipe_part in pipe_parts:
-                    cleaned = pipe_part.strip()
-                    if cleaned:
-                        sub_commands.append(cleaned)
+    # Add final segment
+    cmd = ''.join(current).strip()
+    if cmd:
+        sub_commands.append(cmd)
     return sub_commands

package/scripts/quiz_generator.py CHANGED Viewed

@@ -784,29 +784,25 @@ def generate_what_does_quiz(
         QuizQuestion instance
     """
     cmd_string = command.get("command", "")
-    description = command.get("description", "")
     complexity = command.get("complexity", 2)
     parsed = _parse_command(cmd_string)
     base_cmd = parsed["base"]
-    # Build the correct description using educational bash-focused generator
-    correct_desc = description
-    if not correct_desc:
-        # Use the educational bash description generator
-        correct_desc = _generate_bash_description(cmd_string)
-        # Capitalize first letter for consistent formatting
-        if correct_desc:
-            correct_desc = correct_desc[0].upper() + correct_desc[1:]
-        # Add flag details if available
-        flag_descs = []
-        for flag in parsed["flags"]:
-            fd = _get_flag_description(base_cmd, flag)
-            if fd:
-                flag_descs.append(f"{flag} ({fd.lower()})")
-        if flag_descs:
-            correct_desc += " using " + ", ".join(flag_descs)
+    # Always use the educational bash description generator (not session descriptions)
+    correct_desc = _generate_bash_description(cmd_string)
+    # Capitalize first letter for consistent formatting
+    if correct_desc:
+        correct_desc = correct_desc[0].upper() + correct_desc[1:]
+    # Add flag details if available
+    flag_descs = []
+    for flag in parsed["flags"]:
+        fd = _get_flag_description(base_cmd, flag)
+        if fd:
+            flag_descs.append(f"{flag} ({fd.lower()})")
+    if flag_descs:
+        correct_desc += " using " + ", ".join(flag_descs)
     # Generate distractors
     distractor_descriptions = _generate_distractor_descriptions(correct_desc, 3)
@@ -965,8 +961,6 @@ def generate_build_command_quiz(
         QuizQuestion instance
     """
     cmd_string = command.get("command", "")
-    description = command.get("description", "")
-    intent = command.get("intent", description)
     parsed = _parse_command(cmd_string)
     base_cmd = parsed["base"]
@@ -1051,14 +1045,8 @@ def generate_build_command_quiz(
     question_id = _generate_id(f"build_{cmd_string}")
-    # Use educational bash description for task if no intent/description available
-    if intent:
-        task_description = intent
-    elif description:
-        task_description = description
-    else:
-        # Generate educational description from the command
-        task_description = _generate_bash_description(cmd_string)
+    # Always generate description from the command itself (not session descriptions)
+    task_description = _generate_bash_description(cmd_string)
     return QuizQuestion(
         id=question_id,
@@ -1113,14 +1101,19 @@ def generate_spot_difference_quiz(
     # Build the correct explanation of difference
     differences = []
-    if only_in_1:
-        for flag in only_in_1:
-            desc = _get_flag_description(base_cmd, flag)
-            differences.append(f"Command 1 has `{flag}` ({desc or 'unknown'})")
-    if only_in_2:
-        for flag in only_in_2:
+    has_unknown = False
+    for flag_set, label in [(only_in_1, "Command 1"), (only_in_2, "Command 2")]:
+        for flag in flag_set:
             desc = _get_flag_description(base_cmd, flag)
-            differences.append(f"Command 2 has `{flag}` ({desc or 'unknown'})")
+            # Handle numeric flags like -3 (shorthand for -n 3)
+            if not desc and re.match(r'^-\d+$', flag):
+                desc = f"Specify count ({flag[1:]})"
+            if not desc:
+                has_unknown = True
+            differences.append(f"{label} has `{flag}` ({desc or 'specifies an option'})")
+    # Skip questions where we can't explain the flags well
+    if has_unknown:
+        return None
     if parsed1["args"] != parsed2["args"]:
         differences.append(f"Different arguments: '{' '.join(parsed1['args'])}' vs '{' '.join(parsed2['args'])}'")
@@ -1236,14 +1229,33 @@ def generate_quiz_set(
     """
     questions: list[QuizQuestion] = []
+    # Filter out non-bash entries (Python code fragments, junk tokens, single chars)
+    junk_tokens = {'version', 'total', 'package', 'success', 'error', 'reading',
+                   'editing', 'done', 'warning', 'info', 'note', 'output',
+                   'task', 'goal', 'purpose', 'what', 'description'}
+    clean_commands = []
+    for cmd in analyzed_commands:
+        base = cmd.get("base_command", "")
+        if not base or len(base) < 2:
+            continue
+        if any(c in base for c in ('(', ')', '=', '{', '}')):
+            continue
+        if any(c in base for c in ('\\', '"', "'")) or '&' in base:
+            continue
+        if base[0].isupper() and base.isalpha() and base not in ('PATH', 'HOME'):
+            continue
+        if base.lower() in junk_tokens:
+            continue
+        clean_commands.append(cmd)
     # Filter commands by complexity >= 2
     eligible_commands = [
-        cmd for cmd in analyzed_commands
+        cmd for cmd in clean_commands
         if cmd.get("complexity", 0) >= 2
     ]
     if not eligible_commands:
-        eligible_commands = analyzed_commands
+        eligible_commands = clean_commands if clean_commands else analyzed_commands
     # Weight toward high-frequency commands
     weighted_commands = []
@@ -1270,12 +1282,17 @@ def generate_quiz_set(
         QuizType.SPOT_DIFFERENCE: set(),
     }
+    # Max command length for readable quiz questions
+    MAX_QUIZ_CMD_LEN = 200
     # Generate "What does this do?" questions
     random.shuffle(weighted_commands)
     for cmd in weighted_commands:
         if len([q for q in questions if q.quiz_type == QuizType.WHAT_DOES]) >= target_what_does:
             break
         cmd_id = cmd.get("command", "")
+        if len(cmd_id) > MAX_QUIZ_CMD_LEN:
+            continue
         if cmd_id not in used_per_type[QuizType.WHAT_DOES]:
             q = generate_what_does_quiz(cmd)
             questions.append(q)
@@ -1299,6 +1316,8 @@ def generate_quiz_set(
         if len([q for q in questions if q.quiz_type == QuizType.BUILD_COMMAND]) >= target_build:
             break
         cmd_id = cmd.get("command", "")
+        if len(cmd_id) > MAX_QUIZ_CMD_LEN:
+            continue
         if cmd_id not in used_per_type[QuizType.BUILD_COMMAND]:
             q = generate_build_command_quiz(cmd)
             questions.append(q)
@@ -1310,6 +1329,8 @@ def generate_quiz_set(
         if len([q for q in questions if q.quiz_type == QuizType.SPOT_DIFFERENCE]) >= target_spot_diff:
             break
         cmd_id = cmd.get("command", "")
+        if len(cmd_id) > MAX_QUIZ_CMD_LEN:
+            continue
         if cmd_id not in used_per_type[QuizType.SPOT_DIFFERENCE]:
             variant = _create_similar_command_variant(cmd)
             if variant:
@@ -1318,6 +1339,17 @@ def generate_quiz_set(
                     questions.append(q)
                     used_per_type[QuizType.SPOT_DIFFERENCE].add(cmd_id)
+    # Deduplicate by question text (same question can come from different commands)
+    seen_texts = set()
+    deduped = []
+    for q in questions:
+        # Normalize: take first 80 chars of question text
+        q_key = q.question_text[:80]
+        if q_key not in seen_texts:
+            deduped.append(q)
+            seen_texts.add(q_key)
+    questions = deduped
     # Shuffle final questions
     random.shuffle(questions)