learn_bash_from_session_data 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,591 @@
1
+ """
2
+ Analysis engine for bash command extraction and learning.
3
+
4
+ This module provides complexity scoring, category assignment, deduplication,
5
+ and statistics generation for extracted bash commands.
6
+ """
7
+
8
+ import re
9
+ import hashlib
10
+ from collections import Counter, defaultdict
11
+ from dataclasses import dataclass, field
12
+ from typing import Dict, List, Set, Tuple, Any, Optional
13
+
14
+ from knowledge_base import (
15
+ COMMAND_TO_CATEGORY,
16
+ PIPE_OPERATORS,
17
+ REDIRECT_OPERATORS,
18
+ COMPOUND_OPERATORS,
19
+ SUBSHELL_MARKERS,
20
+ PROCESS_SUBSTITUTION,
21
+ LOOP_KEYWORDS,
22
+ CONDITIONAL_KEYWORDS,
23
+ LEARNING_ORDER,
24
+ get_category,
25
+ get_all_categories,
26
+ )
27
+
28
+
29
@dataclass
class ParsedCommand:
    """Represents a parsed bash command with its components.

    Populated by parse_command(); the boolean fields come from substring
    and keyword scans of the raw text, not from a full shell parse.
    """
    raw: str               # original command string, stripped of surrounding whitespace
    base_command: str      # first "real" utility (wrapper prefixes like sudo/env skipped)
    args: List[str] = field(default_factory=list)   # first-segment tokens without '-' prefix or '='
    flags: List[str] = field(default_factory=list)  # first-segment tokens starting with '-'
    has_pipe: bool = False        # any PIPE_OPERATORS substring present in raw
    has_redirect: bool = False    # any REDIRECT_OPERATORS substring present
    has_compound: bool = False    # any COMPOUND_OPERATORS substring present
    has_subshell: bool = False    # any SUBSHELL_MARKERS substring present
    has_process_sub: bool = False # any PROCESS_SUBSTITUTION marker present
    has_loop: bool = False        # a LOOP_KEYWORDS word appears in raw
    has_conditional: bool = False # a CONDITIONAL_KEYWORDS word appears in raw
    pipe_count: int = 0           # number of '|' pipe characters (see parse_command)
    command_count: int = 1        # rough count of chained commands (pipes, &&, ||, ;)
45
+
46
+
47
@dataclass
class AnalysisResult:
    """Results from analyzing a set of commands.

    Produced by analyze_session(); mirrors the dictionary returned by
    generate_statistics() plus the deduplication output.
    """
    total_commands: int                       # size of the raw input list
    unique_commands: int                      # number of distinct command strings
    unique_base_commands: int                 # number of distinct base utilities
    complexity_distribution: Dict[int, int]   # complexity level (1-5) -> count
    category_breakdown: Dict[str, int]        # category name -> command count
    top_commands: List[Tuple[str, int]]       # (command, frequency), most frequent first
    top_base_commands: List[Tuple[str, int]]  # (base utility, frequency)
    deduplicated_commands: List[Dict[str, Any]]  # per-command metadata from deduplicate()
    fuzzy_groups: Dict[str, List[str]]        # signature -> commands sharing it (size > 1 only)
    statistics: Dict[str, Any]                # full statistics dict from generate_statistics()
60
+
61
+
62
def parse_command(raw_cmd: str) -> ParsedCommand:
    """
    Parse a raw command string into its components.

    Detection is heuristic (substring and token scans), not a full shell
    grammar: quoting is ignored, so e.g. a '|' inside a quoted string
    still counts as a pipe.

    Args:
        raw_cmd: The raw command string to parse

    Returns:
        ParsedCommand with extracted components
    """
    raw_cmd = raw_cmd.strip()

    result = ParsedCommand(raw=raw_cmd, base_command="")

    # Empty/whitespace-only input: return the blank ParsedCommand.
    if not raw_cmd:
        return result

    # Structural operator detection (substring scan over the raw text).
    result.has_pipe = any(op in raw_cmd for op in PIPE_OPERATORS)
    result.has_redirect = any(op in raw_cmd for op in REDIRECT_OPERATORS)
    result.has_compound = any(op in raw_cmd for op in COMPOUND_OPERATORS)
    result.has_subshell = any(marker in raw_cmd for marker in SUBSHELL_MARKERS)
    result.has_process_sub = any(marker in raw_cmd for marker in PROCESS_SUBSTITUTION)

    # Loop/conditional keywords are matched as whole words only.
    words = set(re.findall(r'\b\w+\b', raw_cmd))
    result.has_loop = bool(words & LOOP_KEYWORDS)
    result.has_conditional = bool(words & CONDITIONAL_KEYWORDS)

    # Count single-'|' pipes.  Each '||' contributes TWO '|' characters,
    # so both must be subtracted.  (The previous `- count('||')` removed
    # only one, leaving a phantom pipe per '||' and inflating
    # command_count for commands like `a || b`.)
    result.pipe_count = raw_cmd.count('|') - 2 * raw_cmd.count('||')

    # Rough number of chained commands (pipes, &&, ||, ; separators).
    result.command_count = (1 + result.pipe_count + raw_cmd.count('&&')
                            + raw_cmd.count('||') + raw_cmd.count(';'))

    # Wrapper commands that precede the "real" utility.
    prefix_commands = {'sudo', 'env', 'time', 'nice', 'nohup', 'strace', 'ltrace', 'timeout'}

    # Only the first pipeline segment (before any |, ; or &) is tokenized.
    first_segment = re.split(r'[|;&]', raw_cmd)[0].strip()
    tokens = first_segment.split()

    # Skip wrapper prefixes to find the actual base command.
    base_idx = 0
    while base_idx < len(tokens) and tokens[base_idx] in prefix_commands:
        base_idx += 1

    # Skip VAR=value assignments (the `env FOO=bar cmd` pattern).  If
    # nothing follows the assignments, there is no base command to
    # record (the previous code would leave base_command as "FOO=bar").
    while base_idx < len(tokens) and '=' in tokens[base_idx]:
        base_idx += 1
    if base_idx < len(tokens):
        result.base_command = tokens[base_idx]

    # Classify the remaining tokens of the first segment: '-tokens' are
    # flags, '='-containing tokens are dropped, the rest are args.
    for token in tokens[base_idx + 1:]:
        if token.startswith('-'):
            result.flags.append(token)
        elif '=' not in token:
            result.args.append(token)

    return result
130
+
131
+
132
+ def score_complexity(parsed_cmd: ParsedCommand) -> int:
133
+ """
134
+ Score the complexity of a parsed command from 1-5.
135
+
136
+ Complexity levels:
137
+ 1: Single command, no flags (ls, pwd, cd src)
138
+ 2: Single command with flags (ls -la, grep -r "pattern" .)
139
+ 3: Pipes or redirects (cat file | grep pattern)
140
+ 4: Compound commands, subshells, loops (find . -name "*.ts" | xargs grep)
141
+ 5: Complex pipelines, process substitution, multi-line
142
+
143
+ Args:
144
+ parsed_cmd: The parsed command to score
145
+
146
+ Returns:
147
+ Complexity score from 1 to 5
148
+ """
149
+ if not parsed_cmd.raw:
150
+ return 1
151
+
152
+ score = 1
153
+
154
+ # Level 2: Has flags or multiple arguments
155
+ if parsed_cmd.flags or len(parsed_cmd.args) > 1:
156
+ score = max(score, 2)
157
+
158
+ # Level 3: Has pipes or redirects
159
+ if parsed_cmd.has_pipe or parsed_cmd.has_redirect:
160
+ score = max(score, 3)
161
+
162
+ # Level 4: Compound commands, subshells, loops, or multiple pipes
163
+ if (parsed_cmd.has_compound or parsed_cmd.has_subshell or
164
+ parsed_cmd.has_loop or parsed_cmd.pipe_count >= 2):
165
+ score = max(score, 4)
166
+
167
+ # Level 5: Process substitution, conditionals with pipes, or very complex
168
+ if (parsed_cmd.has_process_sub or
169
+ (parsed_cmd.has_conditional and parsed_cmd.has_pipe) or
170
+ parsed_cmd.command_count >= 4 or
171
+ parsed_cmd.pipe_count >= 3):
172
+ score = max(score, 5)
173
+
174
+ # Additional complexity factors
175
+ raw = parsed_cmd.raw
176
+
177
+ # Check for inline scripts or complex patterns
178
+ complex_patterns = [
179
+ r'\$\{[^}]+\}', # Parameter expansion
180
+ r'\$\([^)]+\)', # Command substitution
181
+ r'`[^`]+`', # Backtick command substitution
182
+ r'\[\[.*\]\]', # Extended test
183
+ r'<<<', # Here string
184
+ r'<<\s*\w+', # Here document
185
+ r'\(\s*\)', # Empty subshell or function
186
+ r'{\s*\w+.*;', # Brace expansion with commands
187
+ ]
188
+
189
+ complex_count = sum(1 for p in complex_patterns if re.search(p, raw))
190
+ if complex_count >= 2:
191
+ score = min(5, score + 1)
192
+
193
+ return score
194
+
195
+
196
def assign_category(parsed_cmd: ParsedCommand) -> str:
    """
    Assign a category to a parsed command based on its base command.

    The knowledge base is consulted first; when it has no entry, a few
    substring heuristics try to infer a category before giving up.

    Args:
        parsed_cmd: The parsed command to categorize

    Returns:
        Category name string ("Unknown" when nothing matches)
    """
    base_cmd = parsed_cmd.base_command
    if not base_cmd:
        return "Unknown"

    category = get_category(base_cmd)
    if category != "Unknown":
        return category

    # Heuristic fallbacks for commands missing from the knowledge base.
    lowered = base_cmd.lower()

    # Git-ish names.
    if lowered.startswith('git-') or 'commit' in lowered or 'branch' in lowered:
        return "Git"

    # Build/test tooling.
    if any(hint in lowered for hint in ('make', 'build', 'compile', 'test')):
        return "Development"

    # Network tooling.
    if any(hint in lowered for hint in ('http', 'ftp', 'ssh', 'net', 'port')):
        return "Networking"

    # Package managers.
    if any(hint in lowered for hint in ('install', 'update', 'upgrade', 'remove')):
        return "Package Management"

    return category
233
+
234
+
235
+ def _normalize_for_fuzzy(cmd: str) -> str:
236
+ """
237
+ Normalize a command for fuzzy deduplication.
238
+
239
+ Replaces specific arguments with placeholders while keeping structure.
240
+ """
241
+ normalized = cmd
242
+
243
+ # Replace quoted strings with placeholder
244
+ normalized = re.sub(r'"[^"]*"', '"<STR>"', normalized)
245
+ normalized = re.sub(r"'[^']*'", "'<STR>'", normalized)
246
+
247
+ # Replace file paths with placeholder
248
+ normalized = re.sub(r'(?<=[=\s])/[^\s]+', '<PATH>', normalized)
249
+ normalized = re.sub(r'(?<=[=\s])\./[^\s]+', '<PATH>', normalized)
250
+ normalized = re.sub(r'(?<=[=\s])~/[^\s]+', '<PATH>', normalized)
251
+
252
+ # Replace numbers with placeholder
253
+ normalized = re.sub(r'\b\d+\b', '<NUM>', normalized)
254
+
255
+ # Replace UUIDs/hashes with placeholder
256
+ normalized = re.sub(r'\b[a-f0-9]{32,}\b', '<HASH>', normalized)
257
+ normalized = re.sub(r'\b[a-f0-9-]{36}\b', '<UUID>', normalized)
258
+
259
+ # Replace IP addresses with placeholder
260
+ normalized = re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '<IP>', normalized)
261
+
262
+ # Replace URLs with placeholder
263
+ normalized = re.sub(r'https?://[^\s]+', '<URL>', normalized)
264
+
265
+ return normalized
266
+
267
+
268
+ def _get_command_signature(parsed_cmd: ParsedCommand) -> str:
269
+ """
270
+ Get a structural signature for a command (for fuzzy grouping).
271
+ """
272
+ parts = [parsed_cmd.base_command]
273
+
274
+ # Add sorted flags (structure matters, not order)
275
+ if parsed_cmd.flags:
276
+ parts.append('FLAGS:' + ','.join(sorted(parsed_cmd.flags)))
277
+
278
+ # Add argument count
279
+ parts.append(f'ARGS:{len(parsed_cmd.args)}')
280
+
281
+ # Add structural markers
282
+ if parsed_cmd.has_pipe:
283
+ parts.append(f'PIPES:{parsed_cmd.pipe_count}')
284
+ if parsed_cmd.has_redirect:
285
+ parts.append('REDIR')
286
+ if parsed_cmd.has_compound:
287
+ parts.append('COMPOUND')
288
+ if parsed_cmd.has_subshell:
289
+ parts.append('SUBSHELL')
290
+
291
+ return '|'.join(parts)
292
+
293
+
294
def deduplicate(commands: List[str]) -> Tuple[List[Dict[str, Any]], Dict[str, List[str]]]:
    """
    Deduplicate commands using both exact and fuzzy matching.

    Args:
        commands: List of raw command strings

    Returns:
        Tuple of (deduplicated command list with metadata, fuzzy groups)
    """
    # Exact-match frequencies; Counter keys preserve first-seen order.
    frequencies = Counter(commands)

    # Parse each distinct command string exactly once.
    parsed_by_cmd = {cmd: parse_command(cmd) for cmd in frequencies}

    # Bucket distinct commands by structural signature.
    signature_groups: Dict[str, List[str]] = defaultdict(list)
    for cmd, parsed in parsed_by_cmd.items():
        signature_groups[_get_command_signature(parsed)].append(cmd)

    # Emit metadata records, most frequent first.  most_common() is a
    # stable descending sort, so ties keep first-seen order.  The first
    # command seen for a signature is not a fuzzy duplicate; later ones are.
    records: List[Dict[str, Any]] = []
    seen: Set[str] = set()
    for cmd, freq in frequencies.most_common():
        parsed = parsed_by_cmd[cmd]
        signature = _get_command_signature(parsed)
        records.append({
            'command': cmd,
            'frequency': freq,
            'base_command': parsed.base_command,
            'complexity': score_complexity(parsed),
            'category': assign_category(parsed),
            'is_fuzzy_duplicate': signature in seen,
            'fuzzy_signature': signature,
            'parsed': parsed,
        })
        seen.add(signature)

    # Keep only signature groups that actually collapse several commands.
    multi_groups = {sig: grp for sig, grp in signature_groups.items() if len(grp) > 1}

    return records, multi_groups
346
+
347
+
348
def generate_statistics(commands: List[str]) -> Dict[str, Any]:
    """
    Generate comprehensive statistics for a list of commands.

    Args:
        commands: List of raw command strings

    Returns:
        Dictionary containing counts, complexity distribution, category
        breakdown, top commands/utilities, and per-category base-command
        coverage.
    """
    if not commands:
        # Zeroed-out summary for empty input.  The 'top_*' and coverage
        # keys are included so callers (e.g. analyze_session) that index
        # into them do not raise KeyError on an empty session — the
        # previous version omitted them and crashed on [].
        return {
            'total_commands': 0,
            'unique_commands': 0,
            'unique_base_commands': 0,
            'complexity_distribution': {},
            'category_breakdown': {},
            'average_complexity': 0.0,
            'most_complex_commands': [],
            'top_commands': [],
            'top_base_commands': [],
            'base_command_coverage': {},
        }

    # Parse every command once; all metrics below derive from this list.
    parsed_commands = [parse_command(cmd) for cmd in commands]

    # Basic counts.
    total = len(commands)
    unique_count = len(set(commands))

    # Base command analysis.
    base_commands = [p.base_command for p in parsed_commands if p.base_command]
    unique_base = set(base_commands)
    base_counts = Counter(base_commands)

    # Score each command exactly once and reuse the scores (the previous
    # version scored every command twice: once for the distribution and
    # once for the most-complex ranking).
    complexities = [score_complexity(p) for p in parsed_commands]
    complexity_dist = Counter(complexities)
    avg_complexity = sum(complexities) / len(complexities) if complexities else 0.0

    # Category analysis.
    category_counts = Counter(assign_category(p) for p in parsed_commands)

    # Ten most complex commands (stable sort keeps input order on ties).
    cmd_complexity = [(p.raw, c) for p, c in zip(parsed_commands, complexities)]
    most_complex = sorted(cmd_complexity, key=lambda x: -x[1])[:10]

    # Command frequency.
    cmd_counts = Counter(commands)

    return {
        'total_commands': total,
        'unique_commands': unique_count,
        'unique_base_commands': len(unique_base),
        'complexity_distribution': dict(sorted(complexity_dist.items())),
        'category_breakdown': dict(sorted(category_counts.items(), key=lambda x: -x[1])),
        'average_complexity': round(avg_complexity, 2),
        'most_complex_commands': most_complex,
        'top_commands': cmd_counts.most_common(20),
        'top_base_commands': base_counts.most_common(20),
        # For each category, how many distinct base utilities were seen.
        'base_command_coverage': {
            cat: sum(1 for b in unique_base if get_category(b) == cat)
            for cat in get_all_categories()
        },
    }
415
+
416
+
417
def analyze_session(extracted_commands: List[str]) -> AnalysisResult:
    """
    Perform complete analysis of extracted commands from a session.

    Args:
        extracted_commands: List of command strings extracted from session data

    Returns:
        AnalysisResult with comprehensive analysis
    """
    # Statistics and deduplication are computed independently.
    stats = generate_statistics(extracted_commands)
    deduplicated, fuzzy_groups = deduplicate(extracted_commands)

    # NOTE: the previous version also built a list of non-fuzzy-duplicate
    # entries here but never used it; callers can filter
    # deduplicated_commands on 'is_fuzzy_duplicate' themselves.
    return AnalysisResult(
        total_commands=stats['total_commands'],
        unique_commands=stats['unique_commands'],
        unique_base_commands=stats['unique_base_commands'],
        complexity_distribution=stats['complexity_distribution'],
        category_breakdown=stats['category_breakdown'],
        top_commands=stats['top_commands'],
        top_base_commands=stats['top_base_commands'],
        deduplicated_commands=deduplicated,
        fuzzy_groups=fuzzy_groups,
        statistics=stats,
    )
448
+
449
+
450
+ def format_analysis_report(result: AnalysisResult) -> str:
451
+ """
452
+ Format an analysis result as a human-readable report.
453
+
454
+ Args:
455
+ result: AnalysisResult to format
456
+
457
+ Returns:
458
+ Formatted string report
459
+ """
460
+ lines = [
461
+ "=" * 60,
462
+ "BASH COMMAND ANALYSIS REPORT",
463
+ "=" * 60,
464
+ "",
465
+ "SUMMARY",
466
+ "-" * 40,
467
+ f"Total commands analyzed: {result.total_commands}",
468
+ f"Unique command strings: {result.unique_commands}",
469
+ f"Unique base utilities: {result.unique_base_commands}",
470
+ f"Average complexity: {result.statistics.get('average_complexity', 0):.2f}",
471
+ "",
472
+ "COMPLEXITY DISTRIBUTION",
473
+ "-" * 40,
474
+ ]
475
+
476
+ complexity_labels = {
477
+ 1: "Simple (no flags)",
478
+ 2: "Basic (with flags)",
479
+ 3: "Intermediate (pipes/redirects)",
480
+ 4: "Advanced (compound/loops)",
481
+ 5: "Expert (complex pipelines)",
482
+ }
483
+
484
+ for level in range(1, 6):
485
+ count = result.complexity_distribution.get(level, 0)
486
+ pct = (count / result.total_commands * 100) if result.total_commands else 0
487
+ bar = "#" * int(pct / 2)
488
+ lines.append(f" {level}: {complexity_labels[level]:<30} {count:>5} ({pct:>5.1f}%) {bar}")
489
+
490
+ lines.extend([
491
+ "",
492
+ "CATEGORY BREAKDOWN",
493
+ "-" * 40,
494
+ ])
495
+
496
+ for category, count in sorted(result.category_breakdown.items(), key=lambda x: -x[1]):
497
+ pct = (count / result.total_commands * 100) if result.total_commands else 0
498
+ lines.append(f" {category:<25} {count:>5} ({pct:>5.1f}%)")
499
+
500
+ lines.extend([
501
+ "",
502
+ "TOP 10 MOST USED COMMANDS",
503
+ "-" * 40,
504
+ ])
505
+
506
+ for cmd, count in result.top_commands[:10]:
507
+ display_cmd = cmd[:50] + "..." if len(cmd) > 50 else cmd
508
+ lines.append(f" {count:>5}x {display_cmd}")
509
+
510
+ lines.extend([
511
+ "",
512
+ "TOP 10 BASE UTILITIES",
513
+ "-" * 40,
514
+ ])
515
+
516
+ for base, count in result.top_base_commands[:10]:
517
+ lines.append(f" {count:>5}x {base}")
518
+
519
+ if result.statistics.get('most_complex_commands'):
520
+ lines.extend([
521
+ "",
522
+ "MOST COMPLEX COMMANDS (TOP 5)",
523
+ "-" * 40,
524
+ ])
525
+ for cmd, complexity in result.statistics['most_complex_commands'][:5]:
526
+ display_cmd = cmd[:60] + "..." if len(cmd) > 60 else cmd
527
+ lines.append(f" [Level {complexity}] {display_cmd}")
528
+
529
+ lines.extend([
530
+ "",
531
+ "=" * 60,
532
+ ])
533
+
534
+ return "\n".join(lines)
535
+
536
+
537
+ # Convenience function for quick analysis
538
def quick_analyze(commands: List[str], verbose: bool = False) -> Dict[str, Any]:
    """
    Perform a quick analysis and return essential metrics.

    Args:
        commands: List of command strings
        verbose: If True, include full deduplicated list

    Returns:
        Dictionary with analysis summary
    """
    analysis = analyze_session(commands)

    summary: Dict[str, Any] = {
        'total': analysis.total_commands,
        'unique': analysis.unique_commands,
        'unique_utilities': analysis.unique_base_commands,
        'complexity': analysis.complexity_distribution,
        'categories': analysis.category_breakdown,
        'top_commands': analysis.top_commands[:10],
        'top_utilities': analysis.top_base_commands[:10],
    }

    # The full detail is opt-in: it can be large.
    if verbose:
        summary['deduplicated'] = analysis.deduplicated_commands
        summary['fuzzy_groups'] = analysis.fuzzy_groups

    return summary
566
+
567
+
568
if __name__ == "__main__":
    # Example usage and testing: a small mixed-complexity command list
    # covering simple commands, pipelines, loops, and both an exact and
    # a fuzzy duplicate, run through the full analysis pipeline.
    test_commands = [
        "ls",
        "ls -la",
        "ls -la /home/user",
        "cd src",
        "pwd",
        "cat file.txt | grep pattern",
        "find . -name '*.py' | xargs grep 'import'",
        "git status",
        "git commit -m 'test commit'",
        "docker run -it --rm ubuntu bash",
        "for f in *.txt; do echo $f; done",
        "curl -s https://api.example.com | jq '.data'",
        "ps aux | grep python | awk '{print $2}' | xargs kill",
        "npm install",
        "pip install -r requirements.txt",
        "ls -la", # Duplicate
        "ls -la /home/other", # Fuzzy duplicate
    ]

    # Run the full pipeline and print the human-readable report.
    result = analyze_session(test_commands)
    print(format_analysis_report(result))