@ebowwa/claudecodehistory 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,261 +0,0 @@
1
- //! High-performance JSONL parser for Claude Code history files
2
- //!
3
- //! Uses memory-mapped I/O and parallel processing for maximum speed.
4
-
5
- use memmap2::Mmap;
6
- use rayon::prelude::*;
7
- use std::fs::File;
8
- use std::path::Path;
9
- use walkdir::WalkDir;
10
-
11
- use crate::types::*;
12
- use crate::utils::*;
13
-
14
- /// Fast directory parser - parses all JSONL files in a directory
15
- ///
16
- /// Uses memory-mapped I/O and parallel processing for maximum speed.
17
- /// This is the equivalent of `parseDirFast` from @ebowwa/jsonl-hft
18
- pub fn parse_dir_fast(dir_path: &str) -> Vec<ParsedEntry> {
19
- let path = Path::new(dir_path);
20
-
21
- if !path.exists() || !path.is_dir() {
22
- return Vec::new();
23
- }
24
-
25
- // Collect all JSONL files
26
- let jsonl_files: Vec<_> = WalkDir::new(path)
27
- .max_depth(1)
28
- .into_iter()
29
- .filter_map(|e| e.ok())
30
- .filter(|e| e.path().extension().map_or(false, |ext| ext == "jsonl"))
31
- .map(|e| e.path().to_path_buf())
32
- .collect();
33
-
34
- // Parse files in parallel
35
- jsonl_files
36
- .par_iter()
37
- .flat_map(|file_path| parse_file_fast(file_path))
38
- .collect()
39
- }
40
-
41
- /// Fast file parser - parses a single JSONL file using memory-mapped I/O
42
- pub fn parse_file_fast(file_path: &Path) -> Vec<ParsedEntry> {
43
- let file = match File::open(file_path) {
44
- Ok(f) => f,
45
- Err(_) => return Vec::new(),
46
- };
47
-
48
- let mmap = match unsafe { Mmap::map(&file) } {
49
- Ok(m) => m,
50
- Err(_) => return Vec::new(),
51
- };
52
-
53
- let file_path_str = file_path.to_string_lossy().to_string();
54
-
55
- // Parse lines in parallel using rayon
56
- mmap.split(|&b| b == b'\n')
57
- .par_bridge()
58
- .filter_map(|line| parse_line_fast(line, &file_path_str))
59
- .collect()
60
- }
61
-
62
- /// Parse a single line into a ParsedEntry
63
- fn parse_line_fast(line: &[u8], file_path: &str) -> Option<ParsedEntry> {
64
- // Skip empty lines
65
- let trimmed = trim_whitespace(line);
66
- if trimmed.is_empty() {
67
- return None;
68
- }
69
-
70
- // Quick validation: must start with '{' and end with '}'
71
- if trimmed.first()? != &b'{' || trimmed.last()? != &b'}' {
72
- return None;
73
- }
74
-
75
- // Parse JSON
76
- let value: serde_json::Value = serde_json::from_slice(trimmed).ok()?;
77
-
78
- // Extract fields with minimal allocations
79
- let obj = value.as_object()?;
80
-
81
- let uuid = obj.get("uuid")?.as_str()?.to_string();
82
- let session_id = obj.get("sessionId")?.as_str()?.to_string();
83
- let timestamp = obj.get("timestamp")?.as_str()?.to_string();
84
-
85
- // Extract type/role
86
- let message_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("user");
87
- let role = message_type.to_string();
88
-
89
- // Extract content from message.content
90
- let content = extract_content(obj).unwrap_or_default();
91
-
92
- Some(ParsedEntry {
93
- uuid,
94
- session_id,
95
- timestamp,
96
- role,
97
- content,
98
- file_path: file_path.to_string(),
99
- })
100
- }
101
-
102
- /// Extract content from the message field
103
- fn extract_content(obj: &serde_json::Map<String, serde_json::Value>) -> Option<String> {
104
- let message = obj.get("message")?.as_object()?;
105
- let content = message.get("content")?;
106
-
107
- match content {
108
- serde_json::Value::String(s) => Some(s.clone()),
109
- serde_json::Value::Array(arr) => {
110
- let texts: Vec<_> = arr
111
- .iter()
112
- .filter_map(|item| {
113
- if let serde_json::Value::String(s) = item {
114
- Some(s.clone())
115
- } else if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
116
- Some(text.to_string())
117
- } else {
118
- None
119
- }
120
- })
121
- .collect();
122
- Some(texts.join(" "))
123
- }
124
- _ => None,
125
- }
126
- }
127
-
128
/// Strip leading and trailing ASCII whitespace from a byte slice.
///
/// "Whitespace" is whatever `u8::is_ascii_whitespace` accepts. The result
/// borrows from `bytes`; an empty or all-whitespace input yields an empty
/// slice.
fn trim_whitespace(bytes: &[u8]) -> &[u8] {
    let is_content = |b: &u8| !b.is_ascii_whitespace();

    match (
        bytes.iter().position(is_content),
        bytes.iter().rposition(is_content),
    ) {
        (Some(first), Some(last)) => &bytes[first..=last],
        // No non-whitespace byte at all.
        _ => &bytes[..0],
    }
}
138
-
139
- /// Parse a JSONL file into ConversationEntry objects
140
- pub fn parse_jsonl_file(
141
- file_path: &Path,
142
- project_dir: &str,
143
- start_date: Option<&str>,
144
- end_date: Option<&str>,
145
- ) -> Vec<ConversationEntry> {
146
- let file = match File::open(file_path) {
147
- Ok(f) => f,
148
- Err(_) => return Vec::new(),
149
- };
150
-
151
- let mmap = match unsafe { Mmap::map(&file) } {
152
- Ok(m) => m,
153
- Err(_) => return Vec::new(),
154
- };
155
-
156
- let project_path = decode_project_path(project_dir);
157
-
158
- mmap.split(|&b| b == b'\n')
159
- .filter_map(|line| {
160
- let trimmed = trim_whitespace(line);
161
- if trimmed.is_empty() {
162
- return None;
163
- }
164
-
165
- let msg: ClaudeCodeMessage = serde_json::from_slice(trimmed).ok()?;
166
-
167
- // Date filtering
168
- if let Some(start) = start_date {
169
- if msg.timestamp.as_str() < start {
170
- return None;
171
- }
172
- }
173
- if let Some(end) = end_date {
174
- if msg.timestamp.as_str() > end {
175
- return None;
176
- }
177
- }
178
-
179
- convert_message_to_entry(msg, &project_path)
180
- })
181
- .collect()
182
- }
183
-
184
- /// Convert a ClaudeCodeMessage to a ConversationEntry
185
- fn convert_message_to_entry(
186
- msg: ClaudeCodeMessage,
187
- project_path: &str,
188
- ) -> Option<ConversationEntry> {
189
- // Extract content
190
- let content = msg.message.as_ref().map(|m| {
191
- match &m.content {
192
- MessageContentValue::String(s) => s.clone(),
193
- MessageContentValue::Array(blocks) => blocks
194
- .iter()
195
- .filter_map(|b| {
196
- b.text.clone().or_else(|| {
197
- b.block_type
198
- .as_ref()
199
- .map(|t| format!("[{}]", t))
200
- })
201
- })
202
- .collect::<Vec<_>>()
203
- .join(" "),
204
- }
205
- }).unwrap_or_default();
206
-
207
- // Calculate time-related fields
208
- let timestamp = msg.timestamp.clone();
209
- let (formatted_time, time_ago, local_date) = format_timestamp(&timestamp);
210
-
211
- Some(ConversationEntry {
212
- session_id: msg.session_id,
213
- timestamp,
214
- entry_type: msg.message_type,
215
- content,
216
- project_path: project_path.to_string(),
217
- uuid: msg.uuid,
218
- formatted_time: Some(formatted_time),
219
- time_ago: Some(time_ago),
220
- local_date: Some(local_date),
221
- metadata: msg.message.map(|m| EntryMetadata {
222
- usage: m.usage,
223
- model: m.model,
224
- request_id: msg.request_id,
225
- total_cost_usd: None,
226
- num_turns: None,
227
- duration_ms: None,
228
- is_error: None,
229
- error_type: None,
230
- }),
231
- project_name: None,
232
- })
233
- }
234
-
235
/// Decode a project path from its directory name.
///
/// Every `-` is turned into `/` and any leading separators are removed, so
/// the result has no leading slash:
/// `"-Users-ebowwa-Desktop-codespaces"` -> `"Users/ebowwa/Desktop/codespaces"`.
///
/// The mapping is lossy: a hyphen that was part of an original directory name
/// (e.g. `my-app`) is also converted to `/`.
fn decode_project_path(project_dir: &str) -> String {
    // Leading `-` or `/` characters would all end up as leading slashes and
    // be trimmed, so drop them up front and allocate only once.
    project_dir
        .trim_start_matches(|c: char| c == '-' || c == '/')
        .replace('-', "/")
}
242
-
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Hyphens become slashes and the leading separator is dropped.
    #[test]
    fn test_decode_project_path() {
        let decoded = decode_project_path("-Users-ebowwa-Desktop-codespaces");
        assert_eq!(decoded, "Users/ebowwa/Desktop/codespaces");
    }

    /// Whitespace is removed from both ends; empty input stays empty.
    #[test]
    fn test_trim_whitespace() {
        let cases: [(&[u8], &[u8]); 3] = [
            (&b" hello "[..], &b"hello"[..]),
            (&b"\n\ttest\n"[..], &b"test"[..]),
            (&b""[..], &b""[..]),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(trim_whitespace(input), *expected);
        }
    }
}
@@ -1,206 +0,0 @@
1
- //! Search index operations
2
- //!
3
- //! Provides index management for conversation history search
4
-
5
- use tantivy::{Index, IndexWriter, TantivyDocument};
6
- use tantivy::schema::{Schema, Value};
7
-
8
- use super::schema::HistoryFields;
9
- use super::SearchError;
10
- use crate::types::*;
11
-
12
- /// Add a conversation entry to the search index
13
- pub fn add_entry_to_index(
14
- writer: &mut IndexWriter<TantivyDocument>,
15
- schema: &Schema,
16
- entry: &ConversationEntry,
17
- ) -> Result<(), SearchError> {
18
- let fields = HistoryFields::new(schema);
19
- let mut doc = TantivyDocument::new();
20
-
21
- // Add primary identifiers
22
- doc.add_text(fields.uuid, &entry.uuid);
23
- doc.add_text(fields.session_id, &entry.session_id);
24
- doc.add_text(fields.project_path, &entry.project_path);
25
-
26
- // Add message type
27
- let message_type = match entry.entry_type {
28
- MessageType::User => "user",
29
- MessageType::Assistant => "assistant",
30
- MessageType::System => "system",
31
- MessageType::Result => "result",
32
- };
33
- doc.add_text(fields.message_type, message_type);
34
-
35
- // Add role (same as message_type for now)
36
- doc.add_text(fields.role, message_type);
37
-
38
- // Add timestamp
39
- let timestamp_ms = super::schema::timestamp_to_ms(&entry.timestamp);
40
- doc.add_i64(fields.timestamp_ms, timestamp_ms);
41
-
42
- // Add content
43
- doc.add_text(fields.content, &entry.content);
44
-
45
- // Build combined searchable text
46
- let project_name = super::schema::extract_project_name(&entry.project_path);
47
- let all_text = format!(
48
- "{} {} {} {}",
49
- entry.content,
50
- entry.session_id,
51
- project_name,
52
- entry.uuid
53
- );
54
- doc.add_text(fields.all_text, &all_text);
55
-
56
- // Add metadata if present
57
- if let Some(ref meta) = entry.metadata {
58
- if let Some(ref model) = meta.model {
59
- doc.add_text(fields.model, model);
60
- }
61
- if let Some(ref request_id) = meta.request_id {
62
- doc.add_text(fields.request_id, request_id);
63
- }
64
- if let Some(cost_usd) = meta.total_cost_usd {
65
- doc.add_f64(fields.cost_usd, cost_usd);
66
- }
67
- if let Some(duration_ms) = meta.duration_ms {
68
- doc.add_i64(fields.duration_ms, duration_ms as i64);
69
- }
70
- if let Some(is_error) = meta.is_error {
71
- doc.add_bool(fields.is_error, is_error);
72
- }
73
- }
74
-
75
- // Add project name for faceted search
76
- doc.add_text(fields.project_name, &project_name);
77
-
78
- writer.add_document(doc).map_err(|e| SearchError::Commit(e.to_string()))?;
79
-
80
- Ok(())
81
- }
82
-
83
- /// Delete all entries for a session from the index
84
- pub fn delete_session_from_index(
85
- writer: &mut IndexWriter<TantivyDocument>,
86
- schema: &Schema,
87
- session_id: &str,
88
- ) -> Result<(), SearchError> {
89
- let field = schema.get_field("session_id")
90
- .map_err(|e| SearchError::Schema(e.to_string()))?;
91
-
92
- let term = tantivy::Term::from_field_text(field, session_id);
93
- writer.delete_term(term);
94
-
95
- Ok(())
96
- }
97
-
98
- /// Get index statistics
99
- pub fn get_index_stats(index: &Index, schema: &Schema) -> Result<super::IndexStats, SearchError> {
100
- let reader = index.reader()
101
- .map_err(|e| SearchError::Query(e.to_string()))?;
102
-
103
- let searcher = reader.searcher();
104
- let fields = HistoryFields::new(schema);
105
-
106
- let total_docs = searcher.num_docs() as usize;
107
-
108
- // Count unique sessions and projects
109
- let mut session_ids: std::collections::HashSet<String> = std::collections::HashSet::new();
110
- let mut project_paths: std::collections::HashSet<String> = std::collections::HashSet::new();
111
- let mut user_messages = 0;
112
- let mut assistant_messages = 0;
113
-
114
- for segment_reader in searcher.segment_readers() {
115
- let store_reader = segment_reader.get_store_reader(0)
116
- .map_err(|e| SearchError::Query(e.to_string()))?;
117
-
118
- for doc_id in segment_reader.doc_ids_alive() {
119
- if let Ok(doc) = store_reader.get::<TantivyDocument>(doc_id) {
120
- // Extract session_id
121
- if let Some(value) = doc.get_first(fields.session_id) {
122
- if let Some(text) = value.as_str() {
123
- session_ids.insert(text.to_string());
124
- }
125
- }
126
-
127
- // Extract project_path
128
- if let Some(value) = doc.get_first(fields.project_path) {
129
- if let Some(text) = value.as_str() {
130
- project_paths.insert(text.to_string());
131
- }
132
- }
133
-
134
- // Count message types
135
- if let Some(value) = doc.get_first(fields.message_type) {
136
- if let Some(text) = value.as_str() {
137
- match text {
138
- "user" => user_messages += 1,
139
- "assistant" => assistant_messages += 1,
140
- _ => {}
141
- }
142
- }
143
- }
144
- }
145
- }
146
- }
147
-
148
- // Calculate index size
149
- let index_size_bytes = get_index_size(index);
150
-
151
- Ok(super::IndexStats {
152
- total_docs,
153
- user_messages,
154
- assistant_messages,
155
- unique_sessions: session_ids.len(),
156
- unique_projects: project_paths.len(),
157
- index_size_bytes,
158
- })
159
- }
160
-
161
- /// Get the size of the index directory in bytes
162
- /// Note: In Tantivy 0.22, ManagedDirectory doesn't expose path()
163
- /// This function returns 0 as a placeholder - index size can be tracked separately
164
- fn get_index_size(_index: &Index) -> u64 {
165
- // Tantivy 0.22 doesn't expose directory path through ManagedDirectory
166
- // Index size tracking would need to be done at a higher level
167
- 0
168
- }
169
-
170
#[cfg(test)]
mod tests {
    use super::super::schema::create_history_schema;
    use super::*;
    use tempfile::tempdir;

    /// Smoke test: a minimal entry without metadata can be added and committed.
    #[test]
    fn test_add_entry_to_index() {
        let dir = tempdir().unwrap();
        let schema = create_history_schema();

        let index =
            Index::create_in_dir(dir.path(), schema.clone()).expect("Failed to create index");
        let mut writer = index.writer(50_000_000).expect("Failed to create writer");

        let entry = ConversationEntry {
            uuid: String::from("test-uuid"),
            session_id: String::from("test-session"),
            timestamp: String::from("2024-01-01T00:00:00Z"),
            project_path: String::from("/test/project"),
            entry_type: MessageType::User,
            content: String::from("Hello world"),
            formatted_time: None,
            time_ago: None,
            local_date: None,
            metadata: None,
            project_name: None,
        };

        assert!(add_entry_to_index(&mut writer, &schema, &entry).is_ok());

        writer.commit().expect("Failed to commit");
    }
}