learn_bash_from_session_data 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,411 @@
1
+ """
2
+ JSONL Extractor for Claude Code Session Data
3
+
4
+ Parses JSONL files from Claude Code sessions, extracts bash tool_use events,
5
+ and correlates them with their corresponding tool_result outputs.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Iterator, Optional
13
+
14
+
15
@dataclass
class ExtractedCommand:
    """
    One bash invocation recovered from a Claude Code session.

    The first five fields are required and positional; the remaining three
    carry defaults so a command can be built before its result is known.
    """

    # Shell command text taken from the tool_use input.
    command: str
    # Human-readable description attached to the tool_use ('' when absent).
    description: str
    # Text of the matching tool_result ('' when no result was found).
    output: str
    # Timestamp of the session entry, kept as a string ('' when unknown).
    timestamp: str
    # Position of the originating entry within its session file.
    sequence_number: int
    # Identifier linking the tool_use block to its tool_result block.
    tool_use_id: str = ""
    # Whether the command is considered to have succeeded.
    success: bool = True
    # Exit code parsed from the output, or None when none was found.
    exit_code: Optional[int] = None
26
+
27
+
28
class JSONLExtractor:
    """
    Extracts bash commands from Claude Code session JSONL files.

    Claude Code sessions store tool interactions in JSONL format with:
    - tool_use blocks containing the command and description
    - tool_result blocks containing the execution output

    This extractor correlates tool_use with tool_result by matching tool_use_id.
    Blank lines, lines that are not valid JSON, and undecodable bytes are
    tolerated so that one damaged line or file does not abort an extraction.
    """

    # Keys checked, in priority order, for a timestamp on the entry itself.
    _ENTRY_TIMESTAMP_KEYS = ('timestamp', 'created_at', 'time', 'ts', 'datetime')
    # Keys checked, in priority order, on the entry's nested 'message' dict.
    _MESSAGE_TIMESTAMP_KEYS = ('timestamp', 'created_at', 'time')
    # Exit-code phrasings, tried in priority order (case-insensitive).  A
    # bracketed form such as "[exit code: 1]" is already matched by the first.
    _EXIT_CODE_PATTERNS = (
        r'exit code[:\s]+(\d+)',
        r'exited with[:\s]+(\d+)',
        r'return code[:\s]+(\d+)',
    )

    def __init__(self, session_path: Optional[Path] = None):
        """
        Initialize the extractor.

        Args:
            session_path: Optional path to Claude Code sessions directory.
                Defaults to ~/.claude/projects/
        """
        if session_path is None:
            session_path = Path.home() / ".claude" / "projects"
        self.session_path = Path(session_path)

    def find_session_files(self, project_filter: Optional[str] = None) -> Iterator[Path]:
        """
        Find all JSONL session files.

        Args:
            project_filter: Optional substring to filter project directories

        Yields:
            Paths matching *.jsonl in each selected project directory and
            its subdirectories
        """
        # is_dir() rather than exists(): a session path that points at a
        # regular file has no projects to list and would make iterdir() raise.
        if not self.session_path.is_dir():
            return

        for project_dir in self.session_path.iterdir():
            if not project_dir.is_dir():
                continue
            if project_filter and project_filter not in project_dir.name:
                continue
            yield from project_dir.rglob("*.jsonl")

    def extract_from_file(self, file_path: Path) -> "list[ExtractedCommand]":
        """
        Extract bash commands from a single JSONL file.

        Args:
            file_path: Path to the JSONL file

        Returns:
            List of ExtractedCommand objects, sorted by sequence number

        Raises:
            OSError: If the file cannot be opened or read
        """
        tool_uses: dict = {}     # tool_use_id -> tool_use data
        tool_results: dict = {}  # tool_use_id -> tool_result data
        sequence = 0             # counts successfully parsed entries only

        # errors='replace' keeps a single undecodable byte from raising
        # UnicodeDecodeError, which the multi-file helpers do not catch; a
        # line damaged by the replacement either still parses or is skipped
        # as invalid JSON below.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue

                self._process_entry(entry, tool_uses, tool_results, sequence)
                sequence += 1

        return self._correlate_commands(tool_uses, tool_results)

    def _process_entry(
        self,
        entry: dict,
        tool_uses: dict,
        tool_results: dict,
        sequence: int
    ) -> None:
        """
        Process a single JSONL entry to extract tool_use and tool_result blocks.

        Three layouts are inspected, in this order, and all that are present
        are processed:
        - a 'content' list directly on the entry
        - a 'content' list on the entry's 'message' dict
        - a 'content' list on each dict in the entry's 'messages' list

        Entries that are not dicts, and content that is not a list (for
        example plain text), are ignored.
        """
        if not isinstance(entry, dict):
            return

        candidates = []
        if 'content' in entry:
            candidates.append(entry['content'])

        message = entry.get('message')
        if isinstance(message, dict) and 'content' in message:
            candidates.append(message['content'])

        # Only a real list is walked; a null or scalar 'messages' value
        # would otherwise raise TypeError on iteration.
        messages = entry.get('messages')
        if isinstance(messages, list):
            for msg in messages:
                if isinstance(msg, dict) and 'content' in msg:
                    candidates.append(msg['content'])

        for content in candidates:
            if not isinstance(content, list):
                continue
            for block in content:
                self._process_content_block(block, tool_uses, tool_results, sequence, entry)

    def _process_content_block(
        self,
        block: dict,
        tool_uses: dict,
        tool_results: dict,
        sequence: int,
        parent_entry: dict
    ) -> None:
        """
        Process a single content block for tool_use or tool_result.

        A tool_use block is recorded in tool_uses when its tool name is
        'bash' (case-insensitive) and it has both a command and an id.
        A tool_result block is recorded in tool_results when it has a
        tool_use_id. Any other block is ignored.
        """
        if not isinstance(block, dict):
            return

        block_type = block.get('type', '')

        if block_type == 'tool_use':
            tool_name = block.get('name', '')
            # A null or non-string name cannot be the Bash tool; checking the
            # type first avoids calling .lower() on it.
            if not isinstance(tool_name, str) or tool_name.lower() != 'bash':
                return

            tool_use_id = block.get('id', '')
            input_data = block.get('input', {})

            command = ''
            description = ''
            if isinstance(input_data, dict):
                command = input_data.get('command', '')
                description = input_data.get('description', '')
            elif isinstance(input_data, str):
                # The input may be the bare command string.
                command = input_data

            if command and tool_use_id:
                tool_uses[tool_use_id] = {
                    'command': command,
                    'description': description or '',
                    'timestamp': self._extract_timestamp(parent_entry),
                    'sequence': sequence
                }

        elif block_type == 'tool_result':
            tool_use_id = block.get('tool_use_id', '')
            if not tool_use_id:
                return

            tool_results[tool_use_id] = {
                'output': self._flatten_result_content(block.get('content', '')),
                'is_error': bool(block.get('is_error', False))
            }

    @staticmethod
    def _flatten_result_content(content) -> str:
        """
        Reduce a tool_result 'content' value to a single string.

        Strings are returned unchanged. A list is joined with newlines from
        its plain-string items and the 'text' of its {'type': 'text'} items;
        other items are dropped. None becomes '' and any other value is
        converted with str(), so the result is always a string.
        """
        if isinstance(content, str):
            return content
        if content is None:
            return ''
        if isinstance(content, list):
            text_parts = []
            for item in content:
                if isinstance(item, dict) and item.get('type') == 'text':
                    text = item.get('text', '')
                    # A non-string 'text' would make str.join raise.
                    text_parts.append(text if isinstance(text, str) else '')
                elif isinstance(item, str):
                    text_parts.append(item)
            return '\n'.join(text_parts)
        return str(content)

    def _extract_timestamp(self, entry: dict) -> str:
        """
        Extract timestamp from entry, trying multiple possible locations.

        The entry's own keys are tried first, then those of its nested
        'message' dict. Null values are skipped so they are not reported as
        the string 'None'. Returns '' when no timestamp is found.
        """
        for key in self._ENTRY_TIMESTAMP_KEYS:
            value = entry.get(key)
            if value is not None:
                return str(value)

        message = entry.get('message')
        if isinstance(message, dict):
            for key in self._MESSAGE_TIMESTAMP_KEYS:
                value = message.get(key)
                if value is not None:
                    return str(value)

        return ''

    def _correlate_commands(
        self,
        tool_uses: dict,
        tool_results: dict
    ) -> "list[ExtractedCommand]":
        """
        Correlate tool_use blocks with their corresponding tool_result blocks.

        A tool_use without a matching result is still returned, with empty
        output and success=True.

        Args:
            tool_uses: Dict mapping tool_use_id to tool_use data
            tool_results: Dict mapping tool_use_id to tool_result data

        Returns:
            List of ExtractedCommand objects, sorted by sequence number
        """
        commands = []

        for tool_use_id, use_data in tool_uses.items():
            result_data = tool_results.get(tool_use_id, {})
            output = result_data.get('output', '')
            is_error = result_data.get('is_error', False)
            exit_code = self._extract_exit_code(output)

            commands.append(ExtractedCommand(
                command=use_data['command'],
                description=use_data['description'],
                output=output,
                timestamp=use_data['timestamp'],
                sequence_number=use_data['sequence'],
                tool_use_id=tool_use_id,
                success=not is_error and (exit_code is None or exit_code == 0),
                exit_code=exit_code
            ))

        # Sort by sequence number to maintain order
        commands.sort(key=lambda cmd: cmd.sequence_number)
        return commands

    def _extract_exit_code(self, output: str) -> Optional[int]:
        """
        Try to extract exit code from command output.

        The patterns are tried in priority order and the first one that
        matches anywhere in the output wins.

        Returns:
            The parsed exit code, or None when the output is empty, is not
            a string, or contains no recognised exit-code phrase
        """
        if not output or not isinstance(output, str):
            return None

        import re

        for pattern in self._EXIT_CODE_PATTERNS:
            match = re.search(pattern, output, re.IGNORECASE)
            if match is None:
                continue
            try:
                return int(match.group(1))
            except ValueError:
                # int() can still reject an extreme digit run; fall through
                # to the next pattern instead of failing the extraction.
                continue

        return None

    def _extract_from_files(self, paths) -> "list[ExtractedCommand]":
        """
        Extract commands from each path, warning on and skipping unreadable files.

        Args:
            paths: Iterable of JSONL file paths

        Returns:
            Commands from all readable files, in the order the files were given
        """
        all_commands = []

        for file_path in paths:
            try:
                all_commands.extend(self.extract_from_file(file_path))
            except OSError as e:
                # Report the failure but continue processing other files
                print(f"Warning: Could not process {file_path}: {e}")

        return all_commands

    def extract_all(
        self,
        project_filter: Optional[str] = None
    ) -> "list[ExtractedCommand]":
        """
        Extract bash commands from all session files.

        Args:
            project_filter: Optional substring to filter project directories

        Returns:
            List of all ExtractedCommand objects from all files
        """
        return self._extract_from_files(self.find_session_files(project_filter))

    def extract_from_directory(self, directory: Path) -> "list[ExtractedCommand]":
        """
        Extract bash commands from all JSONL files in a directory.

        Args:
            directory: Path to directory containing JSONL files; it is
                searched recursively

        Returns:
            List of ExtractedCommand objects (empty when the path is not an
            existing directory)
        """
        directory = Path(directory)

        if not directory.is_dir():
            return []

        return self._extract_from_files(directory.rglob("*.jsonl"))
349
+
350
+
351
def extract_commands_from_jsonl(file_path: str | Path) -> list[ExtractedCommand]:
    """
    Extract bash commands from one JSONL session file.

    Shortcut that builds a JSONLExtractor with its default session path and
    runs it on the given file only.

    Args:
        file_path: Location of the JSONL file, as a string or Path

    Returns:
        List of ExtractedCommand objects found in the file
    """
    return JSONLExtractor().extract_from_file(Path(file_path))
363
+
364
+
365
def extract_commands_from_sessions(
    session_path: Optional[str | Path] = None,
    project_filter: Optional[str] = None
) -> list[ExtractedCommand]:
    """
    Extract bash commands from every session file under a sessions directory.

    Args:
        session_path: Sessions directory as a string or Path. Any falsy value
            (None or an empty string) lets JSONLExtractor pick its default.
        project_filter: Optional substring that a project directory name
            must contain to be included

    Returns:
        List of ExtractedCommand objects from all matching session files
    """
    if session_path:
        sessions_root = Path(session_path)
    else:
        sessions_root = None

    return JSONLExtractor(sessions_root).extract_all(project_filter)
382
+
383
+
384
if __name__ == "__main__":
    import sys

    # Command-line entry point. An optional first argument names a JSONL file
    # or a directory to scan; without it, every session file under the
    # extractor's default session path is processed.
    if len(sys.argv) <= 1:
        extractor = JSONLExtractor()
        commands = extractor.extract_all()
    else:
        target = Path(sys.argv[1])
        extractor = JSONLExtractor()

        if target.is_file():
            commands = extractor.extract_from_file(target)
        elif target.is_dir():
            commands = extractor.extract_from_directory(target)
        else:
            print(f"Error: {target} is not a valid file or directory")
            sys.exit(1)

    print(f"Extracted {len(commands)} bash commands")

    # Preview at most the first ten commands, truncating long fields.
    preview = commands[:10]
    for i, cmd in enumerate(preview, 1):
        print(f"\n--- Command {i} ---")
        print(f"Command: {cmd.command[:100]}{'...' if len(cmd.command) > 100 else ''}")
        print(f"Description: {cmd.description[:80]}{'...' if len(cmd.description) > 80 else ''}")
        print(f"Output length: {len(cmd.output)} chars")
        print(f"Success: {cmd.success}")