learn_bash_from_session_data 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,443 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Main orchestration script for learn_bash_from_session_data.
4
+
5
+ Discovers Claude session files, extracts bash commands, analyzes them,
6
+ generates quizzes, and produces HTML learning materials.
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import os
12
+ import sys
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ from typing import Dict, List, Optional, Tuple
16
+
17
+ # Version check
18
# Version check: stop immediately on interpreters older than 3.8.
if sys.version_info < (3, 8):
    sys.exit(
        "Error: Python 3.8 or higher is required. Current version: "
        + "{}.{}".format(sys.version_info.major, sys.version_info.minor)
    )

# Constants
DEFAULT_OUTPUT_DIR = "./bash-learner-output/"  # default target of -o/--output
MAX_UNIQUE_COMMANDS = 500  # upper bound on unique commands passed to analysis
SESSIONS_BASE_PATH = Path.home().joinpath(".claude", "projects")  # root searched for sessions
26
+
27
+
28
def get_session_metadata(session_path: Path) -> Dict:
    """
    Collect descriptive metadata for one session file.

    Args:
        session_path: Path to the session JSONL file

    Returns:
        Dictionary with the keys ``path``, ``filename``, ``project_hash``,
        ``size_bytes``, ``size_human``, ``modified``, ``modified_str`` and
        ``first_message``. ``first_message`` holds the decoded first line of
        the file, or None when that line is empty, cannot be read, or is not
        valid JSON.
    """
    file_stat = session_path.stat()
    modified_at = datetime.fromtimestamp(file_stat.st_mtime)

    # The grandparent directory name serves as the project hint
    grandparent_name = session_path.parent.parent.name

    # Best effort: peek at the first line only; failures leave the value None
    first_record = None
    try:
        with open(session_path, 'r', encoding='utf-8', errors='replace') as handle:
            head = handle.readline().strip()
        if head:
            first_record = json.loads(head)
    except (json.JSONDecodeError, IOError):
        pass

    metadata = {}
    metadata["path"] = session_path
    metadata["filename"] = session_path.name
    metadata["project_hash"] = grandparent_name
    metadata["size_bytes"] = file_stat.st_size
    metadata["size_human"] = format_file_size(file_stat.st_size)
    metadata["modified"] = modified_at
    metadata["modified_str"] = modified_at.strftime("%Y-%m-%d %H:%M:%S")
    metadata["first_message"] = first_record
    return metadata
64
+
65
+
66
def format_file_size(size_bytes: int) -> str:
    """
    Format a byte count as a human-readable string with one decimal place.

    The value is divided by 1024 until it fits the current unit. The fit is
    decided on the value as it will be displayed (rounded to one decimal), so
    a size just below a unit boundary is promoted to the next unit instead of
    being rendered as e.g. "1024.0 KB".

    Args:
        size_bytes: Size in bytes

    Returns:
        String such as "512.0 B", "1.5 KB" or "2.0 TB"
    """
    size = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB'):
        # Compare the rounded value: 1023.96 would otherwise print as "1024.0"
        if round(size, 1) < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    # Anything that does not fit in GB is reported in TB, however large
    return f"{size:.1f} TB"
73
+
74
+
75
def discover_sessions(
    project_filter: Optional[str] = None,
    limit: Optional[int] = None
) -> List[Dict]:
    """
    Discover available Claude session files.

    Looks for ``*.jsonl`` files in the ``sessions`` sub-directory of every
    directory directly below SESSIONS_BASE_PATH.

    Args:
        project_filter: Optional case-insensitive substring that must occur
            in the full path of a session file for it to be included
        limit: Maximum number of sessions to return; None, zero or a negative
            value means "no limit"

    Returns:
        List of session metadata dictionaries, sorted by modification time (newest first)
    """
    sessions = []

    if not SESSIONS_BASE_PATH.exists():
        return sessions

    needle = project_filter.lower() if project_filter else None

    # Find all session files
    for project_dir in SESSIONS_BASE_PATH.iterdir():
        if not project_dir.is_dir():
            continue

        sessions_dir = project_dir / "sessions"
        if not sessions_dir.exists():
            continue

        for session_file in sessions_dir.glob("*.jsonl"):
            # Apply the filter on the path first, so files that cannot match
            # are never stat'ed or opened
            if needle is not None and needle not in str(session_file).lower():
                continue

            try:
                metadata = get_session_metadata(session_file)
            except OSError as e:
                # e.g. a broken symlink or a file removed while scanning;
                # one bad file must not abort the whole discovery
                print(f"Warning: Skipping unreadable session file {session_file}: {e}")
                continue

            sessions.append(metadata)

    # Sort by modification time (newest first)
    sessions.sort(key=lambda x: x["modified"], reverse=True)

    # A negative limit would otherwise slice entries off the end of the list
    if limit is not None and limit > 0:
        sessions = sessions[:limit]

    return sessions
121
+
122
+
123
def list_sessions(project_filter: Optional[str] = None) -> None:
    """
    Print a numbered table of the discovered sessions.

    Args:
        project_filter: Optional filter for project path substring
    """
    found = discover_sessions(project_filter=project_filter)

    # Nothing to show: explain where session files are expected and stop
    if not found:
        print("\nNo session files found.")
        print(f"\nExpected location: {SESSIONS_BASE_PATH}/<project-hash>/sessions/*.jsonl")
        print("\nMake sure you have Claude session data available.")
        return

    heavy_rule = '=' * 80
    light_rule = '-' * 80
    row_template = "{:<4} {:<20} {:<10} {:<30}"

    print(f"\n{heavy_rule}")
    print(f"Available Claude Sessions ({len(found)} found)")
    print(heavy_rule)
    print(row_template.format('#', 'Date', 'Size', 'Filename'))
    print(light_rule)

    number = 0
    for entry in found:
        number += 1
        # File names are cut to 30 characters to keep the column width fixed
        print(row_template.format(
            number,
            entry['modified_str'],
            entry['size_human'],
            entry['filename'][:30],
        ))

    print(light_rule)
    print("\nUse -n <number> to process the N most recent sessions")
    print("Use -f <path> to process a specific session file")
151
+
152
+
153
def load_session_file(session_path: Path) -> List[Dict]:
    """
    Load and parse a session JSONL file.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than a JSON object, are skipped with a printed warning so
    that the result only ever contains dictionaries.

    Args:
        session_path: Path to the session file

    Returns:
        List of parsed JSON objects from the session, in file order
    """
    entries = []

    with open(session_path, 'r', encoding='utf-8', errors='replace') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping malformed JSON at line {line_num}: {e}")
                continue

            # Scalars, arrays and null are valid JSON but are not session
            # entries; keeping them would break the List[Dict] contract
            if not isinstance(entry, dict):
                print(f"Warning: Skipping non-object JSON at line {line_num}")
                continue

            entries.append(entry)

    return entries
177
+
178
+
179
def _is_forbidden_output_dir(resolved_dir: Path) -> bool:
    """Return True when resolved_dir is, or lies inside, a critical system directory."""
    forbidden_roots = [
        '/etc', '/usr', '/bin', '/sbin', '/lib', '/lib32', '/lib64', '/libx32',
        '/boot', '/root', '/sys', '/proc',
    ]
    for root in forbidden_roots:
        root_path = Path(root)
        # Compare whole path components: a plain string-prefix test would also
        # reject unrelated directories such as /usrdata or /library
        if resolved_dir == root_path or root_path in resolved_dir.parents:
            return True
    return False


def run_extraction_pipeline(
    sessions: List[Dict],
    output_dir: Path
) -> Tuple[bool, str]:
    """
    Run the full extraction and generation pipeline.

    Loads every session file, extracts and parses the bash commands,
    deduplicates them (capped at MAX_UNIQUE_COMMANDS), analyzes them,
    generates quizzes and HTML files, and writes ``summary.json`` into the
    output directory.

    Args:
        sessions: List of session metadata dictionaries; the ``path``,
            ``filename`` and ``size_human`` keys are used
        output_dir: Directory for output files (created if missing)

    Returns:
        Tuple of (success: bool, message: str)
    """
    # Safety check: prevent writing to critical system directories.
    # Done before anything else so an invalid target fails fast.
    output_resolved = output_dir.resolve()
    if _is_forbidden_output_dir(output_resolved):
        return False, f"Safety error: Cannot write to system directory: {output_resolved}"

    # Import processing modules (lazy import to allow standalone testing)
    try:
        from scripts.extractor import extract_commands
        from scripts.parser import parse_commands
        from scripts.analyzer import analyze_commands
        from scripts.quiz_generator import generate_quizzes
        from scripts.html_generator import generate_html
    except ImportError:
        # Try relative import for when run as script
        try:
            from extractor import extract_commands
            from parser import parse_commands
            from analyzer import analyze_commands
            from quiz_generator import generate_quizzes
            from html_generator import generate_html
        except ImportError as e:
            return False, f"Failed to import processing modules: {e}"

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nProcessing {len(sessions)} session(s)...")
    print(f"Output directory: {output_dir.absolute()}")
    print("-" * 60)

    # Step 1: Load all session data
    all_entries = []
    for session in sessions:
        print(f"Loading: {session['filename']} ({session['size_human']})")
        entries = load_session_file(session['path'])
        all_entries.extend(entries)
        print(f" -> Loaded {len(entries)} entries")

    if not all_entries:
        return False, "No session entries found in the provided files."

    print(f"\nTotal entries loaded: {len(all_entries)}")

    # Step 2: Extract commands
    print("\nExtracting bash commands...")
    raw_commands = extract_commands(all_entries)
    print(f" -> Found {len(raw_commands)} raw commands")

    if not raw_commands:
        return False, ("No bash commands found in the session data. "
                       "Try analyzing more sessions with -n <number>.")

    # Step 3: Parse commands
    print("\nParsing commands...")
    parsed_commands = parse_commands(raw_commands)
    print(f" -> Parsed {len(parsed_commands)} commands")

    # Step 4: Deduplicate and cap
    unique_commands = deduplicate_commands(parsed_commands)
    if len(unique_commands) > MAX_UNIQUE_COMMANDS:
        print(f"\nCapping at {MAX_UNIQUE_COMMANDS} unique commands "
              f"(found {len(unique_commands)})")
        unique_commands = unique_commands[:MAX_UNIQUE_COMMANDS]
    else:
        print(f"\n{len(unique_commands)} unique commands")

    # Step 5: Analyze commands
    print("\nAnalyzing commands...")
    analysis = analyze_commands(unique_commands)
    print(f" -> Generated analysis with {len(analysis.get('categories', {}))} categories")

    # Step 6: Generate quizzes
    print("\nGenerating quizzes...")
    quizzes = generate_quizzes(unique_commands, analysis)
    # A dict result is counted per group; any other result is counted directly
    quiz_count = sum(len(q) for q in quizzes.values()) if isinstance(quizzes, dict) else len(quizzes)
    print(f" -> Generated {quiz_count} quiz questions")

    # Step 7: Generate HTML
    print("\nGenerating HTML output...")
    html_files = generate_html(unique_commands, analysis, quizzes, output_dir)
    print(f" -> Created {len(html_files)} HTML files")

    # Write summary JSON
    summary = {
        "generated_at": datetime.now().isoformat(),
        "sessions_processed": len(sessions),
        "total_entries": len(all_entries),
        "raw_commands": len(raw_commands),
        "unique_commands": len(unique_commands),
        "categories": list(analysis.get('categories', {}).keys()),
        "quiz_count": quiz_count,
        "html_files": [str(f) for f in html_files],
    }

    summary_path = output_dir / "summary.json"
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f"\nSummary written to: {summary_path}")

    return True, f"Successfully generated learning materials in {output_dir}"
296
+
297
+
298
def deduplicate_commands(commands: List[Dict]) -> List[Dict]:
    """
    Drop repeated commands, keeping the first occurrence of each.

    The identity of a command is its ``command`` value, falling back to its
    ``raw`` value; commands for which both are missing or empty are dropped.

    Args:
        commands: List of parsed command dictionaries

    Returns:
        Deduplicated list of commands in their original order
    """
    first_by_key = {}

    for candidate in commands:
        identity = candidate.get('command', '') or candidate.get('raw', '')
        if not identity:
            continue
        # setdefault keeps the earliest entry; dicts preserve insertion order
        first_by_key.setdefault(identity, candidate)

    return list(first_by_key.values())
319
+
320
+
321
def parse_arguments() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments."""
    # Kept flush-left: RawDescriptionHelpFormatter prints this text verbatim
    usage_examples = """
Examples:
%(prog)s --list List available sessions
%(prog)s -n 5 Process 5 most recent sessions
%(prog)s -f /path/to/session.jsonl Process specific session file
%(prog)s -n 10 -o ./my-output/ Process 10 sessions to custom directory
%(prog)s -l -p myproject List sessions matching 'myproject'
"""

    cli = argparse.ArgumentParser(
        description="Learn Bash from Claude session data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, keyword settings) in the order they appear in --help
    option_specs = [
        (('-n', '--sessions'), {
            'type': int,
            'default': 1,
            'help': 'Number of recent sessions to process (default: 1)',
        }),
        (('-f', '--file'), {
            'type': str,
            'help': 'Specific session file path to process',
        }),
        (('-o', '--output'), {
            'type': str,
            'default': DEFAULT_OUTPUT_DIR,
            'help': f'Output directory (default: {DEFAULT_OUTPUT_DIR})',
        }),
        (('-l', '--list'), {
            'action': 'store_true',
            'help': 'List available sessions',
        }),
        (('-p', '--project'), {
            'type': str,
            'help': 'Filter sessions by project path substring',
        }),
        (('-v', '--verbose'), {
            'action': 'store_true',
            'help': 'Enable verbose output',
        }),
    ]
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
375
+
376
+
377
def main() -> int:
    """
    Main entry point.

    Parses the command line, then either lists the available sessions
    (--list) or runs the extraction pipeline on the session given with
    -f/--file or on the most recent discovered sessions.

    Returns:
        Exit code (0 for success, non-zero for errors)
    """
    args = parse_arguments()

    # Handle --list
    if args.list:
        list_sessions(project_filter=args.project)
        return 0

    # Determine which sessions to process
    sessions_to_process = []

    if args.file:
        # Process specific file
        file_path = Path(args.file)
        if not file_path.exists():
            print(f"Error: Session file not found: {args.file}")
            return 1
        # A directory passes exists() but cannot be read as a session file
        if not file_path.is_file():
            print(f"Error: Session path is not a file: {args.file}")
            return 1
        if file_path.suffix != '.jsonl':
            print(f"Warning: Expected .jsonl file, got: {file_path.suffix}")

        try:
            sessions_to_process = [get_session_metadata(file_path)]
        except OSError as e:
            print(f"Error: Cannot read session file {args.file}: {e}")
            return 1

    else:
        # Discover and select sessions
        sessions = discover_sessions(
            project_filter=args.project,
            limit=args.sessions
        )

        if not sessions:
            print("\nNo session files found.")
            print(f"\nExpected location: {SESSIONS_BASE_PATH}/<project-hash>/sessions/*.jsonl")
            print("\nTo create session data, use Claude Code and your sessions will be stored automatically.")
            print("\nUse --list to see available sessions once you have some.")
            return 1

        sessions_to_process = sessions

    # Run the pipeline
    output_dir = Path(args.output)
    try:
        success, message = run_extraction_pipeline(sessions_to_process, output_dir)
    except OSError as e:
        # Unreadable input or unwritable output: report it through the exit
        # code instead of letting a traceback escape
        success, message = False, f"I/O failure while processing sessions: {e}"

    if success:
        print(f"\n{'='*60}")
        print("SUCCESS!")
        print(message)
        print(f"{'='*60}")

        # Print next steps
        index_file = output_dir / "index.html"
        if index_file.exists():
            print(f"\nOpen {index_file.absolute()} in your browser to start learning!")

        return 0
    else:
        print(f"\nError: {message}")
        return 1


if __name__ == "__main__":
    sys.exit(main())