@da-trollefsen/claude-wrapped 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,382 @@
1
+ """Read and parse Claude Code conversation history from local JSONL files."""
2
+
3
+ import json
4
+ import os
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+ from typing import Iterator
9
+
10
+ from dotenv import load_dotenv
11
+
12
+ # Load .env file if it exists
13
+ load_dotenv()
14
+
15
+
16
@dataclass
class TokenUsage:
    """Token counters attached to one message.

    Every counter defaults to zero, so a partially populated usage record
    still produces a meaningful total.
    """

    input_tokens: int = 0
    output_tokens: int = 0
    cache_creation_tokens: int = 0
    cache_read_tokens: int = 0

    @property
    def total_tokens(self) -> int:
        """Return the sum of all four counters."""
        counters = (
            self.input_tokens,
            self.output_tokens,
            self.cache_creation_tokens,
            self.cache_read_tokens,
        )
        return sum(counters)
27
+
28
+
29
@dataclass
class Message:
    """One conversation message, either a user prompt or an assistant reply.

    Only ``role`` and ``content`` are required. Every other field is optional
    metadata and defaults to ``None`` (``tool_calls`` defaults to a fresh
    empty list per instance).
    """

    # --- required ---
    role: str  # 'user' or 'assistant'
    content: str

    # --- optional metadata ---
    timestamp: datetime | None = None
    model: str | None = None
    usage: TokenUsage | None = None
    session_id: str | None = None
    project: str | None = None
    git_branch: str | None = None
    tool_calls: list[str] = field(default_factory=list)
    message_id: str | None = None  # Key used when deduplicating messages
42
+
43
+
44
@dataclass
class Session:
    """A conversation session: an identified, per-project list of messages.

    ``messages`` defaults to a fresh empty list per instance; the start and
    end times are optional and default to ``None``.
    """

    # --- required ---
    session_id: str
    project: str

    # --- optional ---
    messages: list[Message] = field(default_factory=list)
    start_time: datetime | None = None
    end_time: datetime | None = None
52
+
53
+
54
def get_claude_dir() -> Path:
    """Return the Claude Code data directory, ``~/.claude``.

    Raises:
        FileNotFoundError: If ``~/.claude`` does not exist.
    """
    candidate = Path.home().joinpath(".claude")
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Claude Code directory not found: {candidate}")
60
+
61
+
62
def get_custom_claude_dirs() -> list[Path]:
    """Get custom Claude Code directories from environment variables.

    Supports:
    - CLAUDE_BACKUP_DIRS: Comma-separated list of directories to scan in
      addition to the main Claude directory. Surrounding whitespace is
      stripped and a leading ``~`` is expanded.

    Blank entries (from a trailing or doubled comma) are ignored. Entries
    that are not existing directories are reported with a warning on stdout
    and skipped.

    Returns:
        List of Path objects for each valid custom directory, in the order
        they appear in the variable.
    """
    custom_dirs: list[Path] = []

    backup_dirs_str = os.getenv("CLAUDE_BACKUP_DIRS", "")
    for raw_entry in backup_dirs_str.split(","):
        entry = raw_entry.strip()
        if not entry:
            # Path("") is the current directory, so an empty entry would
            # silently add the cwd as a backup location. Skip it instead.
            continue

        try:
            dir_path = Path(entry).expanduser()
        except RuntimeError:
            # expanduser() raises when the home directory of "~user" cannot
            # be resolved; treat that like any other missing directory.
            print(f"Warning: Custom directory not found: {entry}")
            continue

        if dir_path.is_dir():
            custom_dirs.append(dir_path)
        else:
            print(f"Warning: Custom directory not found: {dir_path}")

    return custom_dirs
85
+
86
+
87
+ def parse_timestamp(ts: int | str | None) -> datetime | None:
88
+ """Parse a timestamp from various formats and convert to local time."""
89
+ if ts is None:
90
+ return None
91
+ if isinstance(ts, int):
92
+ # Milliseconds since epoch - fromtimestamp returns local time
93
+ return datetime.fromtimestamp(ts / 1000)
94
+ if isinstance(ts, str):
95
+ # ISO format with Z (UTC)
96
+ try:
97
+ # Parse as UTC
98
+ utc_dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
99
+ # Convert to local time (removes timezone info but shifts the time)
100
+ local_dt = utc_dt.astimezone().replace(tzinfo=None)
101
+ return local_dt
102
+ except ValueError:
103
+ return None
104
+ return None
105
+
106
+
107
+ def extract_tool_calls(content: list | str) -> list[str]:
108
+ """Extract tool call names from message content."""
109
+ tool_calls = []
110
+ if isinstance(content, list):
111
+ for item in content:
112
+ if isinstance(item, dict):
113
+ if item.get('type') == 'tool_use':
114
+ tool_calls.append(item.get('name', 'unknown'))
115
+ return tool_calls
116
+
117
+
118
def _token_count(usage_data: dict, key: str) -> int:
    """Return ``usage_data[key]`` if it is a real integer, otherwise 0."""
    value = usage_data.get(key)
    if isinstance(value, int) and not isinstance(value, bool):
        return value
    return 0


def parse_jsonl_record(record: dict) -> Message | None:
    """Parse a single JSONL record into a Message.

    Args:
        record: One decoded line of a session JSONL file.

    Returns:
        A ``Message`` for records whose ``type`` is ``'user'`` or
        ``'assistant'`` and that carry a non-empty ``message`` object;
        ``None`` for every other record, including values that are not
        dictionaries at all.
    """
    # A JSONL line may decode to a list, number or string; only objects count.
    if not isinstance(record, dict):
        return None

    record_type = record.get('type')
    if record_type not in ('user', 'assistant'):
        return None

    message_data = record.get('message')
    if not isinstance(message_data, dict) or not message_data:
        return None

    raw_content = message_data.get('content', '')
    if isinstance(raw_content, list):
        # Extract text from content blocks; other block types are ignored
        # here (tool_use names are collected separately below).
        text_parts = []
        for item in raw_content:
            if isinstance(item, dict) and item.get('type') == 'text':
                text = item.get('text', '')
                # An explicit null text would break the join below.
                text_parts.append(text if isinstance(text, str) else '')
            elif isinstance(item, str):
                text_parts.append(item)
        content = '\n'.join(text_parts)
    elif isinstance(raw_content, str):
        content = raw_content
    else:
        # null or an unexpected type: keep Message.content a string.
        content = ''

    usage = None
    usage_data = message_data.get('usage')
    if isinstance(usage_data, dict) and usage_data:
        # Explicit nulls are coerced to 0 so total_tokens never adds None.
        usage = TokenUsage(
            input_tokens=_token_count(usage_data, 'input_tokens'),
            output_tokens=_token_count(usage_data, 'output_tokens'),
            cache_creation_tokens=_token_count(usage_data, 'cache_creation_input_tokens'),
            cache_read_tokens=_token_count(usage_data, 'cache_read_input_tokens'),
        )

    role = message_data.get('role')
    if not isinstance(role, str) or not role:
        # Fall back to the record-level type when the role is absent or null.
        role = record_type

    try:
        timestamp = parse_timestamp(record.get('timestamp'))
    except (OverflowError, OSError, ValueError):
        # An unrepresentable timestamp should not discard the message.
        timestamp = None

    return Message(
        role=role,
        content=content,
        timestamp=timestamp,
        model=message_data.get('model'),
        usage=usage,
        session_id=record.get('sessionId'),
        project=record.get('cwd'),
        git_branch=record.get('gitBranch'),
        tool_calls=extract_tool_calls(raw_content),
        message_id=message_data.get('id'),  # Used for deduplication
    )
162
+
163
+
164
def iter_project_sessions(claude_dir: Path) -> Iterator[tuple[str, Path]]:
    """Iterate over all project session JSONL files.

    Looks for ``<claude_dir>/projects/<project>/*.jsonl``.

    Args:
        claude_dir: Directory expected to contain a ``projects/`` folder.

    Yields:
        ``(project_directory_name, jsonl_path)`` pairs. Nothing is yielded
        when ``projects`` is missing or is not a directory.
    """
    projects_dir = claude_dir / "projects"
    # is_dir() rather than exists(): a plain file named "projects" would make
    # iterdir() raise NotADirectoryError.
    if not projects_dir.is_dir():
        return

    for project_dir in projects_dir.iterdir():
        if not project_dir.is_dir():
            continue
        for jsonl_file in project_dir.glob("*.jsonl"):
            yield project_dir.name, jsonl_file
175
+
176
+
177
def iter_flat_sessions(flat_dir: Path) -> Iterator[tuple[str, Path]]:
    """Yield ``(project_name, jsonl_path)`` for JSONL files directly in *flat_dir*.

    Handles directories that hold ``*.jsonl`` files at the top level rather
    than in the nested ``projects/[project-name]/*.jsonl`` layout. The
    directory's own name is used as the project name. Nothing is yielded when
    *flat_dir* is missing or is not a directory.
    """
    if flat_dir.exists() and flat_dir.is_dir():
        project_name = flat_dir.name
        yield from (
            (project_name, candidate)
            for candidate in flat_dir.glob("*.jsonl")
            if candidate.is_file()
        )
190
+
191
+
192
def iter_projects_folder(projects_dir: Path) -> Iterator[tuple[str, Path]]:
    """Yield ``(project_name, jsonl_path)`` when *projects_dir* IS the projects folder.

    Expects subdirectories such as ``-Users-you-project/`` that hold
    ``*.jsonl`` files. This mirrors iter_project_sessions, except that the
    given directory is scanned directly instead of its ``projects/`` child.
    Nothing is yielded when *projects_dir* is missing or is not a directory.
    """
    if not (projects_dir.exists() and projects_dir.is_dir()):
        return

    subdirs = (entry for entry in projects_dir.iterdir() if entry.is_dir())
    for subdir in subdirs:
        for session_path in subdir.glob("*.jsonl"):
            yield subdir.name, session_path
207
+
208
+
209
def read_session_file(jsonl_path: Path) -> list[Message]:
    """Read all messages from a session JSONL file.

    Reading is best-effort: blank lines, lines that are not valid JSON, lines
    that do not decode to an object, and records the parser cannot handle are
    skipped. If the file cannot be opened or read, the messages collected so
    far (possibly none) are returned.

    Args:
        jsonl_path: Path of the session file.

    Returns:
        The parsed messages in file order.
    """
    messages: list[Message] = []
    try:
        # errors='replace' keeps one stray non-UTF-8 byte from raising
        # UnicodeDecodeError and discarding the rest of the file.
        with open(jsonl_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(record, dict):
                    # Valid JSON, but not a record object.
                    continue
                try:
                    message = parse_jsonl_record(record)
                except (AttributeError, TypeError, ValueError, OverflowError, OSError):
                    # Structurally unexpected record: skip it, keep reading.
                    continue
                if message:
                    messages.append(message)
    except OSError:
        # Deliberately best-effort: an unreadable file contributes whatever
        # was parsed before the error.
        pass
    return messages
228
+
229
+
230
def read_history_file(claude_dir: Path) -> list[Message]:
    """Read the main history.jsonl file (user prompts only).

    Each JSON object line becomes a ``Message`` with role ``'user'``, taking
    its content from ``display``, its timestamp from ``timestamp`` and its
    project from ``project``. Reading is best-effort: a missing or unreadable
    file and malformed lines are skipped silently.

    Args:
        claude_dir: Directory that may contain ``history.jsonl``.

    Returns:
        The parsed messages in file order (empty if the file is absent).
    """
    history_file = claude_dir / "history.jsonl"
    messages: list[Message] = []

    try:
        # A missing file raises FileNotFoundError (an OSError), handled below.
        # errors='replace' prevents one bad byte from aborting the read.
        with open(history_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(record, dict):
                    # Valid JSON, but not a record object.
                    continue

                try:
                    timestamp = parse_timestamp(record.get('timestamp'))
                except (OverflowError, OSError, ValueError):
                    # Keep the prompt even if its timestamp is unusable.
                    timestamp = None

                # History file has different format
                display = record.get('display')
                messages.append(Message(
                    role='user',
                    # An explicit null display must not become content=None.
                    content=display if isinstance(display, str) else '',
                    timestamp=timestamp,
                    project=record.get('project'),
                ))
    except OSError:
        # Deliberately best-effort: return whatever was read so far.
        pass
    return messages
257
+
258
+
259
+ def read_stats_cache(claude_dir: Path) -> dict | None:
260
+ """Read the pre-computed stats cache if available."""
261
+ stats_file = claude_dir / "stats-cache.json"
262
+ if not stats_file.exists():
263
+ return None
264
+ try:
265
+ with open(stats_file, 'r', encoding='utf-8') as f:
266
+ return json.load(f)
267
+ except (json.JSONDecodeError, IOError):
268
+ return None
269
+
270
+
271
def _iter_session_files(scan_dir: Path) -> Iterator[Path]:
    """Yield every session JSONL path under *scan_dir*, whatever its layout."""
    if (scan_dir / "projects").exists():
        # Structure 1: Standard ~/.claude with projects/ subdirectory
        # Example: ~/.claude/projects/[project-name]/*.jsonl
        sessions = iter_project_sessions(scan_dir)
    elif any(d.is_dir() and any(d.glob("*.jsonl")) for d in scan_dir.iterdir()):
        # Structure 2: Directory IS a projects folder
        # Example: ~/.claude/backups/projects/[project-name]/*.jsonl
        sessions = iter_projects_folder(scan_dir)
    elif any(scan_dir.glob("*.jsonl")):
        # Structure 3: Flat directory with *.jsonl files directly
        # Example: ~/exported-chats/*.jsonl
        sessions = iter_flat_sessions(scan_dir)
    else:
        return

    for _project_name, jsonl_path in sessions:
        yield jsonl_path


def _deduplicate_messages(messages: list[Message]) -> list[Message]:
    """Collapse duplicate messages, keeping the last occurrence of each."""
    by_id: dict[str, Message] = {}
    by_content: dict[tuple[str, str], Message] = {}  # For messages without IDs
    without_timestamp: list[Message] = []  # Edge case: no timestamp at all

    for msg in messages:
        if msg.message_id:
            # Keep latest version (overwrite previous): the last occurrence
            # is the one expected to carry the final token counts.
            by_id[msg.message_id] = msg
        elif msg.timestamp is not None:
            # No ID: key on the timestamp plus the first 100 characters of
            # the content, again keeping the last occurrence.
            key = (msg.timestamp.isoformat(), msg.content[:100] if msg.content else "")
            by_content[key] = msg
        else:
            # Neither ID nor timestamp - can't deduplicate, keep all.
            without_timestamp.append(msg)

    return [*by_id.values(), *by_content.values(), *without_timestamp]


def load_all_messages(claude_dir: Path | None = None, year: int | None = None,
                      include_custom_dirs: bool = True) -> list[Message]:
    """Load all messages from all sessions, optionally filtered by year.

    Reads from both project session files (detailed) and history.jsonl (older data).
    Deduplicates messages by message_id to avoid counting duplicate entries
    that can occur from streaming or retries. Messages without an ID are
    deduplicated by timestamp plus the first 100 characters of their content.

    Each scanned directory is handled as the first layout that matches:
    a ``projects/`` subdirectory (history.jsonl is read too), project folders
    directly inside it, or ``*.jsonl`` files directly inside it. Paths that
    are not directories are skipped; a directory that cannot be listed is
    reported with a warning on stdout instead of aborting the load.

    Args:
        claude_dir: Main Claude directory (defaults to ~/.claude)
        year: Filter messages by year (a falsy value means no filtering)
        include_custom_dirs: If True, also loads from CLAUDE_BACKUP_DIRS env variable

    Returns:
        The unique messages sorted by timestamp; messages without a timestamp
        sort first.

    Raises:
        FileNotFoundError: If ``claude_dir`` is omitted and ~/.claude is missing.
    """
    if claude_dir is None:
        claude_dir = get_claude_dir()

    # Collect all directories to scan
    dirs_to_scan = [claude_dir]
    if include_custom_dirs:
        custom_dirs = get_custom_claude_dirs()
        if custom_dirs:
            print(f"Loading from {len(custom_dirs)} additional backup director{'y' if len(custom_dirs) == 1 else 'ies'}...")
            dirs_to_scan.extend(custom_dirs)

    all_messages: list[Message] = []
    for scan_dir in dirs_to_scan:
        if not scan_dir.is_dir():
            # Missing path or plain file: nothing to scan, and iterdir()
            # would raise on it.
            continue

        try:
            for jsonl_path in _iter_session_files(scan_dir):
                all_messages.extend(read_session_file(jsonl_path))
        except OSError as exc:
            # e.g. PermissionError while listing; keep what was loaded.
            print(f"Warning: Could not scan directory {scan_dir}: {exc}")

        if (scan_dir / "projects").exists():
            # The standard layout also carries history.jsonl
            all_messages.extend(read_history_file(scan_dir))

    unique_messages = _deduplicate_messages(all_messages)

    # Filter by year if specified. Truthiness is kept on purpose: year=0 has
    # always meant "no filter" and can never match a real datetime.
    if year:
        unique_messages = [
            m for m in unique_messages
            if m.timestamp and m.timestamp.year == year
        ]

    # Sort by timestamp; datetime.min places undated messages first.
    unique_messages.sort(key=lambda m: m.timestamp or datetime.min)

    return unique_messages
365
+
366
+
367
if __name__ == "__main__":
    # Manual smoke test: print where the data lives, then summarise 2025.
    claude_dir = get_claude_dir()
    print(f"Claude dir: {claude_dir}")

    messages = load_all_messages(year=2025)
    print(f"Total messages in 2025: {len(messages)}")

    # Per-role message counts
    user_count = sum(1 for m in messages if m.role == 'user')
    assistant_count = sum(1 for m in messages if m.role == 'assistant')
    print(f"User messages: {user_count}")
    print(f"Assistant messages: {assistant_count}")

    # Token usage, counting only messages that carry usage data
    total_tokens = 0
    for m in messages:
        if m.usage:
            total_tokens += m.usage.total_tokens
    print(f"Total tokens: {total_tokens:,}")