devlake-mcp 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,547 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Transcript 解析工具模块
5
+
6
+ 提供 transcript 文件的解析功能:
7
+ - 获取最新用户消息 UUID
8
+ - 解析最新的 Claude 响应
9
+ - 提取使用的工具列表
10
+ - 统计消息数量
11
+ - 读取完整内容
12
+ - 压缩 transcript 内容
13
+ """
14
+
15
+ import json
16
+ import logging
17
+ import gzip
18
+ import base64
19
+ from typing import Optional, Dict, List, Any
20
+ from datetime import datetime, timezone, timedelta
21
+
22
+ from devlake_mcp.constants import (
23
+ TRANSCRIPT_COMPRESSION_THRESHOLD,
24
+ TRANSCRIPT_COMPRESSION_ALGORITHM,
25
+ )
26
+
27
+ # Module-level logger, obtained from the standard Python logging package
28
+ logger = logging.getLogger(__name__)
29
+
30
# Fixed UTC+8 offset used as the target zone for timestamp conversion.
UTC_PLUS_8 = timezone(timedelta(hours=8))


def convert_to_utc_plus_8(iso_timestamp: str) -> Optional[str]:
    """
    Convert an ISO 8601 timestamp to the UTC+8 time zone.

    Args:
        iso_timestamp: ISO 8601 timestamp, e.g. "2025-11-03T05:39:16.109Z".

    Returns:
        The same instant rendered by ``datetime.isoformat()`` in UTC+8, e.g.
        "2025-11-03T13:39:16.109000+08:00" (fractional seconds, when present,
        are always printed with six digits).
        ``None`` when the input is empty or ``None``.
        The original value, unchanged, when it cannot be parsed or converted.
    """
    # Nothing to convert; this is why the return type is Optional.
    if not iso_timestamp:
        return None

    try:
        # "Z" is rewritten to an explicit offset so that
        # datetime.fromisoformat() accepts it on every supported Python.
        parsed = datetime.fromisoformat(iso_timestamp.replace('Z', '+00:00'))

        # NOTE(review): a timestamp without any offset is naive, and
        # astimezone() then interprets it in the host's local zone - confirm
        # that callers always pass offset-aware values.
        return parsed.astimezone(UTC_PLUS_8).isoformat()
    except Exception as e:
        # Best effort: never raise, hand the caller back what it gave us.
        logger.error(f"Failed to convert timestamp {iso_timestamp}: {e}")
        return iso_timestamp
59
+
60
+
61
def get_latest_user_message_uuid(transcript_path: str) -> Optional[str]:
    """
    Return the UUID of the most recent user message in a transcript.

    The transcript is read as JSONL and scanned once from the last line to
    the first. If no ``type == 'user'`` entry carries a non-empty ``uuid``,
    the ``leafUuid`` of the most recent ``type == 'summary'`` entry is used
    instead.

    Lines that are not valid JSON, or that decode to something other than a
    JSON object, are skipped.

    Args:
        transcript_path: Path of the transcript file.

    Returns:
        The latest user message UUID, else the latest summary ``leafUuid``,
        else ``None`` (also ``None`` when the file cannot be read).
    """
    logger.debug(f"开始获取最新用户消息 UUID,transcript: {transcript_path}")

    try:
        with open(transcript_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        logger.debug(f"读取到 {len(lines)} 行数据")

        user_msg_count = 0
        fallback_leaf_uuid = None  # leafUuid of the newest summary seen so far

        for line in reversed(lines):
            try:
                msg = json.loads(line.strip())
            except json.JSONDecodeError:
                continue
            # A line such as `[1, 2]` is valid JSON but has no .get();
            # skip it instead of aborting the whole scan.
            if not isinstance(msg, dict):
                continue

            msg_type = msg.get('type')
            if msg_type == 'user':
                user_msg_count += 1
                msg_uuid = msg.get('uuid')
                logger.debug(f"找到第 {user_msg_count} 个 user 消息,UUID: {msg_uuid}")
                if msg_uuid:
                    logger.debug(f"成功获取用户消息 UUID: {msg_uuid}")
                    return msg_uuid
            elif msg_type == 'summary' and fallback_leaf_uuid is None:
                # Scanning backwards, so the first hit is the newest summary.
                fallback_leaf_uuid = msg.get('leafUuid') or None

        logger.debug("未找到有效的 user 消息 UUID,尝试从 summary 获取 leafUuid")

        if fallback_leaf_uuid:
            logger.info(f"未找到用户消息 UUID,使用 summary 的 leafUuid: {fallback_leaf_uuid}")
            return fallback_leaf_uuid

    except FileNotFoundError:
        logger.error(f"Transcript 文件不存在: {transcript_path}")
    except Exception as e:
        logger.error(f"Failed to get latest user message UUID: {e}")

    # Neither a user UUID nor a summary leafUuid could be obtained.
    logger.warning("无法从 transcript 获取 UUID 或 leafUuid")
    return None
117
+
118
+
119
def _message_payload(msg: Dict) -> Dict:
    """Return the ``message`` object of a transcript entry, or ``{}`` if absent or not a dict."""
    payload = msg.get('message')
    return payload if isinstance(payload, dict) else {}


def _latest_assistant_message(lines: List[str]) -> Optional[Dict]:
    """Return the last JSON object with ``type == 'assistant'`` in *lines*, or ``None``."""
    for line in reversed(lines):
        try:
            msg = json.loads(line.strip())
        except json.JSONDecodeError:
            continue
        if isinstance(msg, dict) and msg.get('type') == 'assistant':
            return msg
    return None


def _build_assistant_response(msg: Dict) -> Dict:
    """Project an assistant transcript entry onto the dictionary returned to callers."""
    payload = _message_payload(msg)
    usage = payload.get('usage')
    return {
        'uuid': msg.get('uuid'),
        'parent_uuid': msg.get('parentUuid'),
        'content': payload.get('content', []),
        # An explicit `"usage": null` is normalised to an empty dict.
        'usage': usage if isinstance(usage, dict) else {},
        'timestamp': msg.get('timestamp'),
        'model': payload.get('model')
    }


def parse_latest_response(
    transcript_path: str,
    max_wait: float = 5,
    wait_interval: float = 0.1
) -> Optional[Dict]:
    """
    Parse the most recent assistant response, waiting for it to be complete.

    The transcript is re-read every ``wait_interval`` seconds until the last
    ``type == 'assistant'`` entry looks complete, i.e. its
    ``usage.output_tokens`` is greater than 1 or it has a ``stop_reason``.
    When the wait budget is exhausted, the last assistant entry found is
    returned even if it looks incomplete.

    Lines that are not valid JSON, or that are not JSON objects, are skipped.

    Args:
        transcript_path: Path of the transcript file.
        max_wait: Maximum time to wait, in seconds (wall clock). Default 5.
        wait_interval: Pause between two reads, in seconds. Default 0.1.

    Returns:
        A dictionary with the keys ``uuid``, ``parent_uuid``, ``content``,
        ``usage``, ``timestamp`` and ``model``; ``None`` when no assistant
        entry exists or the file cannot be read.
    """
    import time

    try:
        # A monotonic deadline also accounts for the time spent reading the
        # file, not only the time spent sleeping.
        deadline = time.monotonic() + max_wait
        response = None

        while True:
            with open(transcript_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            logger.debug(f"读取 transcript: {transcript_path}, 行数: {len(lines)}")

            response = None
            latest = _latest_assistant_message(lines)
            if latest is not None:
                response = _build_assistant_response(latest)
                output_tokens = response['usage'].get('output_tokens') or 0
                stop_reason = _message_payload(latest).get('stop_reason')

                # Complete when more than the first token was produced,
                # or when a stop_reason is present.
                has_tokens = isinstance(output_tokens, (int, float)) and output_tokens > 1
                if has_tokens or stop_reason:
                    logger.debug(f"找到完整响应:tokens={output_tokens}, stop_reason={stop_reason}")
                    return response

                logger.debug(f"响应未完成(tokens={output_tokens}),等待...")

            if time.monotonic() >= deadline:
                break
            time.sleep(wait_interval)

        # Timed out: fall back to whatever the final read produced.
        logger.warning(f"等待 {max_wait}s 后仍未获取完整响应,返回最后的响应")
        return response

    except Exception as e:
        logger.error(f"Failed to parse latest response: {e}")

    return None
197
+
198
+
199
def extract_tools_used(response_message: Dict) -> List[str]:
    """
    Extract the names of the tools used in a response.

    Every ``content`` item that is a dict with ``type == 'tool_use'`` and a
    non-empty ``name`` contributes its name. Names are de-duplicated and
    returned in order of first appearance, so the result is stable from one
    run to the next.

    Args:
        response_message: Response dictionary (as returned by
            parse_latest_response).

    Returns:
        List of unique tool names, e.g. ['Edit', 'Bash', 'Read']. Empty when
        ``content`` is missing or not a list, or when extraction fails.
    """
    # A dict is used as an insertion-ordered set.
    tools: Dict[str, None] = {}
    try:
        content = response_message.get('content', [])
        if isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and item.get('type') == 'tool_use':
                    tool_name = item.get('name', '')
                    if tool_name:
                        tools.setdefault(tool_name, None)
    except Exception as e:
        logger.error(f"Failed to extract tools: {e}")

    return list(tools)
222
+
223
+
224
def count_user_messages(transcript_path: str) -> int:
    """
    Count the user messages in a transcript.

    Each line is decoded as JSON; every JSON object whose ``type`` is
    ``'user'`` is counted. Lines that are not valid JSON, or that are not
    JSON objects, are skipped and do not interrupt the count.

    Args:
        transcript_path: Path of the transcript file.

    Returns:
        Number of user messages. If reading fails part-way, the count
        reached so far is returned (0 when the file cannot be opened).
    """
    count = 0
    try:
        with open(transcript_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    msg = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # Valid JSON that is not an object (list, string, number)
                # has no .get(); ignore it rather than abort the count.
                if isinstance(msg, dict) and msg.get('type') == 'user':
                    count += 1
    except Exception as e:
        logger.error(f"Failed to count user messages: {e}")

    return count
248
+
249
+
250
def read_transcript_content(transcript_path: str) -> str:
    """
    Read a transcript file and return its entire text.

    Args:
        transcript_path: Path of the transcript file.

    Returns:
        The full JSONL content as one string, decoded as UTF-8. An empty
        string is returned (and the error is logged) when reading fails.
    """
    try:
        with open(transcript_path, mode='r', encoding='utf-8') as transcript_file:
            raw_text = transcript_file.read()
    except Exception as err:
        logger.error(f"Failed to read transcript: {err}")
        return ''
    return raw_text
266
+
267
+
268
def get_user_message_by_uuid(transcript_path: str, user_uuid: str) -> Optional[Dict]:
    """
    Look up a user message by UUID and return its details.

    The transcript is scanned from the top; the first JSON object whose
    ``uuid`` equals ``user_uuid`` and whose ``type`` is ``'user'`` is
    returned. Lines that are not valid JSON, or that are not JSON objects,
    are skipped.

    Args:
        transcript_path: Path of the transcript file.
        user_uuid: UUID of the user message.

    Returns:
        A dictionary with the keys ``uuid``, ``content`` (taken from
        ``message.content``, ``''`` when absent), ``timestamp``,
        ``parent_uuid``, ``cwd``, ``permission_mode`` and ``raw_message``
        (the decoded entry itself); ``None`` when no such message exists or
        the file cannot be read.
    """
    try:
        with open(transcript_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    msg = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # Skip valid JSON that is not an object (no .get()).
                if not isinstance(msg, dict):
                    continue
                if msg.get('uuid') != user_uuid or msg.get('type') != 'user':
                    continue

                # The user's text lives in message.content; tolerate a
                # missing or null `message`.
                message_obj = msg.get('message')
                if not isinstance(message_obj, dict):
                    message_obj = {}

                return {
                    'uuid': msg.get('uuid'),
                    'content': message_obj.get('content', ''),
                    'timestamp': msg.get('timestamp'),
                    'parent_uuid': msg.get('parentUuid'),
                    # Extra metadata, present only on some entries.
                    'cwd': msg.get('cwd'),
                    'permission_mode': msg.get('permissionMode'),
                    'raw_message': msg  # keep the original entry for callers that need more
                }
    except Exception as e:
        logger.error(f"Failed to get user message by UUID {user_uuid}: {e}")

    return None
306
+
307
+
308
def _is_tool_result_message(msg: Dict) -> bool:
    """Return True when ``message.content`` is a list containing a ``tool_result`` item."""
    message_obj = msg.get('message')
    if not isinstance(message_obj, dict):
        return False
    content = message_obj.get('content', '')
    if not isinstance(content, list):
        return False
    return any(
        isinstance(item, dict) and item.get('type') == 'tool_result'
        for item in content
    )


def trace_to_user_message(transcript_path: str, start_uuid: str, max_depth: int = 100) -> Optional[str]:
    """
    Walk the ``parentUuid`` chain from a UUID back to the originating user message.

    A ``type == 'user'`` entry whose content contains a ``tool_result`` item
    is not treated as real user input and is skipped, as is every non-user
    entry (assistant, system, ...). This covers chains such as:

    1. user -> assistant(thinking) -> assistant(thinking) -> assistant(response)
    2. user(prompt) -> assistant(tool_use) -> user(tool_result) -> assistant(response)
    3. longer chains mixing tool calls, hook output and system messages

    Transcript lines that are not valid JSON, or that are not JSON objects,
    are ignored while the UUID index is built.

    Args:
        transcript_path: Path of the transcript file.
        start_uuid: UUID to start from (typically an assistant message's
            parentUuid).
        max_depth: Maximum number of hops, guarding against endless loops.
            Default 100.

    Returns:
        UUID of the first real user message found on the chain; ``None``
        when a UUID is missing from the transcript, an entry has no
        ``parentUuid``, the depth limit is reached, or reading fails.
    """
    try:
        # Index every entry by its UUID.
        uuid_to_message = {}
        with open(transcript_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    msg = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # Valid JSON that is not an object cannot be a message.
                if isinstance(msg, dict):
                    uuid_to_message[msg.get('uuid')] = msg

        current_uuid = start_uuid
        depth = 0

        while current_uuid and depth < max_depth:
            msg = uuid_to_message.get(current_uuid)
            if not msg:
                logger.warning(f"UUID {current_uuid} not found in transcript")
                return None

            msg_type = msg.get('type')
            is_user = msg_type == 'user'

            if is_user:
                if not _is_tool_result_message(msg):
                    logger.debug(f"找到真实 user 消息: {current_uuid}")
                    return current_uuid
                logger.debug(f"跳过 tool_result 类型的 user 消息: {current_uuid}")

            # Everything else (tool_result users, assistants, system, ...)
            # is skipped by moving to its parent.
            parent_uuid = msg.get('parentUuid')
            if not parent_uuid:
                if is_user:
                    logger.warning(f"tool_result user message {current_uuid} has no parentUuid")
                elif msg_type == 'assistant':
                    logger.warning(f"No parentUuid for assistant message {current_uuid}")
                else:
                    logger.warning(f"No parentUuid for message type {msg_type}: {current_uuid}")
                return None

            current_uuid = parent_uuid
            depth += 1

        if depth >= max_depth:
            logger.warning(f"Exceeded max depth {max_depth} when tracing from {start_uuid}")

        return None

    except Exception as e:
        logger.error(f"Failed to trace to user message: {e}")
        return None
406
+
407
+
408
def get_transcript_stats(transcript_path: str) -> Dict:
    """
    Collect basic statistics about a transcript.

    Counts the JSON objects whose ``type`` is ``'user'`` or ``'assistant'``
    and reports the file size. Lines that are not valid JSON, or that are
    not JSON objects, are skipped without affecting the counts.

    Args:
        transcript_path: Path of the transcript file.

    Returns:
        Dictionary with ``user_messages``, ``assistant_messages``,
        ``total_messages`` (sum of the two), ``file_size_bytes`` and
        ``file_size_kb`` (bytes / 1024, rounded to 2 decimals). All values
        are zero when the file cannot be read.
    """
    try:
        import os

        user_count = 0
        assistant_count = 0

        with open(transcript_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    msg = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # Valid JSON that is not an object has no .get(); skip it
                # so that one odd line does not zero out the statistics.
                if not isinstance(msg, dict):
                    continue

                msg_type = msg.get('type')
                if msg_type == 'user':
                    user_count += 1
                elif msg_type == 'assistant':
                    assistant_count += 1

        # The file may have been removed since it was read; report 0 then.
        file_size = os.path.getsize(transcript_path) if os.path.exists(transcript_path) else 0

        return {
            'user_messages': user_count,
            'assistant_messages': assistant_count,
            'total_messages': user_count + assistant_count,
            'file_size_bytes': file_size,
            'file_size_kb': round(file_size / 1024, 2)
        }
    except Exception as e:
        logger.error(f"Failed to get transcript stats: {e}")
        return {
            'user_messages': 0,
            'assistant_messages': 0,
            'total_messages': 0,
            'file_size_bytes': 0,
            'file_size_kb': 0.0
        }
454
+
455
+
456
def _utf8_size_or_zero(content: Any) -> int:
    """Return the UTF-8 size of *content* without ever raising (0 when it cannot be measured)."""
    try:
        # errors='replace' lets text that strict UTF-8 rejects (e.g. lone
        # surrogates) still be measured approximately.
        return len(content.encode('utf-8', errors='replace'))
    except Exception:
        return 0


def compress_transcript_content(
    content: str,
    threshold_bytes: int = TRANSCRIPT_COMPRESSION_THRESHOLD
) -> Dict[str, Any]:
    """
    Compress transcript content when it is larger than a threshold.

    - UTF-8 size greater than ``threshold_bytes``: gzip (level 6) followed
      by base64 encoding, so the result can be carried inside JSON.
    - Otherwise: the content is returned unchanged.

    Args:
        content: Original JSONL content.
        threshold_bytes: Compression threshold in bytes; defaults to
            TRANSCRIPT_COMPRESSION_THRESHOLD.

    Returns:
        Dict[str, Any]: {
            'content': str,             # base64(gzip(content)) or the original text
            'compression': str,         # TRANSCRIPT_COMPRESSION_ALGORITHM or 'none'
            'original_size': int,       # UTF-8 size of the input, in bytes
            'compressed_size': int,     # size of the gzip output (before base64),
                                        # or original_size when not compressed
            'compression_ratio': float  # percentage saved; 0.0 when not compressed
        }

    Error handling:
        If anything fails, the function falls back to returning the original
        content with 'compression' set to 'none'. The fallback itself does
        not raise: when the input cannot be measured its size is reported
        as 0.

    Example:
        >>> content = read_transcript_content('/path/to/transcript.jsonl')
        >>> result = compress_transcript_content(content)
        >>> if result['compression'] == 'gzip':
        >>>     logger.info(f"压缩率: {result['compression_ratio']:.1f}%")
    """
    try:
        # 1. Measure the original payload.
        content_bytes = content.encode('utf-8')
        original_size = len(content_bytes)

        # 2. Small payloads are passed through untouched.
        if original_size <= threshold_bytes:
            logger.debug(
                f"Transcript 大小 {original_size} bytes <= {threshold_bytes} bytes,不压缩"
            )
            return {
                'content': content,
                'compression': 'none',
                'original_size': original_size,
                'compressed_size': original_size,
                'compression_ratio': 0.0
            }

        # 3. Compress.
        logger.debug(
            f"Transcript 大小 {original_size} bytes > {threshold_bytes} bytes,开始压缩"
        )

        compressed_bytes = gzip.compress(content_bytes, compresslevel=6)
        compressed_size = len(compressed_bytes)

        # base64 makes the binary gzip stream safe to embed in JSON.
        encoded_content = base64.b64encode(compressed_bytes).decode('ascii')

        # 4. Percentage of bytes saved by gzip.
        compression_ratio = (1 - compressed_size / original_size) * 100

        logger.info(
            f"压缩完成:{original_size} bytes → {compressed_size} bytes "
            f"(压缩率: {compression_ratio:.1f}%)"
        )

        return {
            'content': encoded_content,
            # NOTE(review): the data is always produced with gzip above;
            # presumably this constant is 'gzip' - confirm in constants.
            'compression': TRANSCRIPT_COMPRESSION_ALGORITHM,
            'original_size': original_size,
            'compressed_size': compressed_size,
            'compression_ratio': round(compression_ratio, 2)
        }

    except Exception as e:
        # Compression failed: degrade to the uncompressed payload.
        logger.error(f"压缩 transcript 失败,使用原始内容: {e}", exc_info=True)

        # Must not re-run the strict encode here: if that is what failed,
        # doing it again would raise out of this handler.
        original_size = _utf8_size_or_zero(content)
        return {
            'content': content,
            'compression': 'none',
            'original_size': original_size,
            'compressed_size': original_size,
            'compression_ratio': 0.0
        }