iflow-mcp_developermode-korea_reversecore-mcp 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. iflow_mcp_developermode_korea_reversecore_mcp-1.0.0.dist-info/METADATA +543 -0
  2. iflow_mcp_developermode_korea_reversecore_mcp-1.0.0.dist-info/RECORD +79 -0
  3. iflow_mcp_developermode_korea_reversecore_mcp-1.0.0.dist-info/WHEEL +5 -0
  4. iflow_mcp_developermode_korea_reversecore_mcp-1.0.0.dist-info/entry_points.txt +2 -0
  5. iflow_mcp_developermode_korea_reversecore_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
  6. iflow_mcp_developermode_korea_reversecore_mcp-1.0.0.dist-info/top_level.txt +1 -0
  7. reversecore_mcp/__init__.py +9 -0
  8. reversecore_mcp/core/__init__.py +78 -0
  9. reversecore_mcp/core/audit.py +101 -0
  10. reversecore_mcp/core/binary_cache.py +138 -0
  11. reversecore_mcp/core/command_spec.py +357 -0
  12. reversecore_mcp/core/config.py +432 -0
  13. reversecore_mcp/core/container.py +288 -0
  14. reversecore_mcp/core/decorators.py +152 -0
  15. reversecore_mcp/core/error_formatting.py +93 -0
  16. reversecore_mcp/core/error_handling.py +142 -0
  17. reversecore_mcp/core/evidence.py +229 -0
  18. reversecore_mcp/core/exceptions.py +296 -0
  19. reversecore_mcp/core/execution.py +240 -0
  20. reversecore_mcp/core/ghidra.py +642 -0
  21. reversecore_mcp/core/ghidra_helper.py +481 -0
  22. reversecore_mcp/core/ghidra_manager.py +234 -0
  23. reversecore_mcp/core/json_utils.py +131 -0
  24. reversecore_mcp/core/loader.py +73 -0
  25. reversecore_mcp/core/logging_config.py +206 -0
  26. reversecore_mcp/core/memory.py +721 -0
  27. reversecore_mcp/core/metrics.py +198 -0
  28. reversecore_mcp/core/mitre_mapper.py +365 -0
  29. reversecore_mcp/core/plugin.py +45 -0
  30. reversecore_mcp/core/r2_helpers.py +404 -0
  31. reversecore_mcp/core/r2_pool.py +403 -0
  32. reversecore_mcp/core/report_generator.py +268 -0
  33. reversecore_mcp/core/resilience.py +252 -0
  34. reversecore_mcp/core/resource_manager.py +169 -0
  35. reversecore_mcp/core/result.py +132 -0
  36. reversecore_mcp/core/security.py +213 -0
  37. reversecore_mcp/core/validators.py +238 -0
  38. reversecore_mcp/dashboard/__init__.py +221 -0
  39. reversecore_mcp/prompts/__init__.py +56 -0
  40. reversecore_mcp/prompts/common.py +24 -0
  41. reversecore_mcp/prompts/game.py +280 -0
  42. reversecore_mcp/prompts/malware.py +1219 -0
  43. reversecore_mcp/prompts/report.py +150 -0
  44. reversecore_mcp/prompts/security.py +136 -0
  45. reversecore_mcp/resources.py +329 -0
  46. reversecore_mcp/server.py +727 -0
  47. reversecore_mcp/tools/__init__.py +49 -0
  48. reversecore_mcp/tools/analysis/__init__.py +74 -0
  49. reversecore_mcp/tools/analysis/capa_tools.py +215 -0
  50. reversecore_mcp/tools/analysis/die_tools.py +180 -0
  51. reversecore_mcp/tools/analysis/diff_tools.py +643 -0
  52. reversecore_mcp/tools/analysis/lief_tools.py +272 -0
  53. reversecore_mcp/tools/analysis/signature_tools.py +591 -0
  54. reversecore_mcp/tools/analysis/static_analysis.py +479 -0
  55. reversecore_mcp/tools/common/__init__.py +58 -0
  56. reversecore_mcp/tools/common/file_operations.py +352 -0
  57. reversecore_mcp/tools/common/memory_tools.py +516 -0
  58. reversecore_mcp/tools/common/patch_explainer.py +230 -0
  59. reversecore_mcp/tools/common/server_tools.py +115 -0
  60. reversecore_mcp/tools/ghidra/__init__.py +19 -0
  61. reversecore_mcp/tools/ghidra/decompilation.py +975 -0
  62. reversecore_mcp/tools/ghidra/ghidra_tools.py +1052 -0
  63. reversecore_mcp/tools/malware/__init__.py +61 -0
  64. reversecore_mcp/tools/malware/adaptive_vaccine.py +579 -0
  65. reversecore_mcp/tools/malware/dormant_detector.py +756 -0
  66. reversecore_mcp/tools/malware/ioc_tools.py +228 -0
  67. reversecore_mcp/tools/malware/vulnerability_hunter.py +519 -0
  68. reversecore_mcp/tools/malware/yara_tools.py +214 -0
  69. reversecore_mcp/tools/patch_explainer.py +19 -0
  70. reversecore_mcp/tools/radare2/__init__.py +13 -0
  71. reversecore_mcp/tools/radare2/r2_analysis.py +972 -0
  72. reversecore_mcp/tools/radare2/r2_session.py +376 -0
  73. reversecore_mcp/tools/radare2/radare2_mcp_tools.py +1183 -0
  74. reversecore_mcp/tools/report/__init__.py +4 -0
  75. reversecore_mcp/tools/report/email.py +82 -0
  76. reversecore_mcp/tools/report/report_mcp_tools.py +344 -0
  77. reversecore_mcp/tools/report/report_tools.py +1076 -0
  78. reversecore_mcp/tools/report/session.py +194 -0
  79. reversecore_mcp/tools/report_tools.py +11 -0
@@ -0,0 +1,643 @@
1
+ """Binary diffing and library matching tools for comparing binaries and identifying library code."""
2
+
3
import os
import re
from bisect import bisect_right
from functools import lru_cache
from typing import Optional

from fastmcp import Context

# Use high-performance JSON implementation (3-5x faster)
from reversecore_mcp.core import json_utils as json
from reversecore_mcp.core.config import get_config
from reversecore_mcp.core.decorators import log_execution
from reversecore_mcp.core.error_handling import handle_tool_errors
from reversecore_mcp.core.execution import execute_subprocess_async
from reversecore_mcp.core.metrics import track_metrics

# Import shared R2 helper functions from core (avoids circular dependencies)
from reversecore_mcp.core.r2_helpers import (
    build_r2_cmd as _build_r2_cmd,
    execute_r2_command as _execute_r2_command,
    parse_json_output as _parse_json_output,
)
from reversecore_mcp.core.result import ToolResult, failure, success
from reversecore_mcp.core.security import validate_file_path
from reversecore_mcp.core.validators import validate_tool_parameters
30
+
31
# Load default timeout (seconds) from the central configuration
DEFAULT_TIMEOUT = get_config().default_tool_timeout

# OPTIMIZATION: Pre-compile regex patterns used in hot paths
# Matches radiff2 similarity lines, e.g. "similarity: 0.95"
_SIMILARITY_PATTERN = re.compile(r"similarity:\s*(\d+\.?\d*)")
# Matches a 0x-prefixed hexadecimal address, e.g. "0x401050"
_ADDRESS_PATTERN = re.compile(r"(0x[0-9a-fA-F]+)")
# Matches a bare or 0x-prefixed run of 4+ hex digits (addresses embedded in function names)
_HEX_PATTERN = re.compile(r"(?:0x)?([0-9a-fA-F]{4,})")
38
+
39
+
40
+ @lru_cache(maxsize=256)
41
+ def _extract_library_name(function_name: str) -> str:
42
+ """
43
+ Extract library name from function name.
44
+
45
+ Cached to avoid repeated string comparisons for common function names.
46
+ Optimized to call lower() only once.
47
+
48
+ Args:
49
+ function_name: Function name (e.g., "sym.imp.strcpy")
50
+
51
+ Returns:
52
+ Extracted library name or "unknown"
53
+ """
54
+ # Convert to lowercase once for efficient comparison
55
+ name_lower = function_name.lower()
56
+
57
+ # Simple heuristic extraction
58
+ if "kernel32" in name_lower:
59
+ return "kernel32"
60
+ if "msvcrt" in name_lower or "libc" in name_lower:
61
+ return "libc/msvcrt"
62
+ if "std::" in name_lower:
63
+ return "libstdc++"
64
+ if "imp." in name_lower:
65
+ return "import"
66
+ return "unknown"
67
+
68
+
69
+ @log_execution(tool_name="diff_binaries")
70
+ @track_metrics("diff_binaries")
71
+ @handle_tool_errors
72
+ async def diff_binaries(
73
+ file_path_a: str,
74
+ file_path_b: str,
75
+ function_name: str = None,
76
+ max_output_size: int = 10_000_000,
77
+ timeout: int = DEFAULT_TIMEOUT,
78
+ ) -> ToolResult:
79
+ """
80
+ Compare two binary files to identify code changes and modifications.
81
+
82
+ This tool uses radiff2 to perform binary diffing, which is essential for:
83
+ - **Patch Analysis (1-day Exploits)**: Compare pre-patch and post-patch binaries
84
+ to identify security vulnerabilities fixed in updates
85
+ - **Game Hacking**: Find offset changes after game updates to maintain functionality
86
+ - **Malware Variant Analysis**: Identify code differences between malware variants
87
+ (e.g., "90% similar to Lazarus malware, but C2 address generation changed")
88
+
89
+ The tool provides:
90
+ - Similarity score (0.0-1.0) between binaries
91
+ - List of code changes with addresses and descriptions
92
+ - Optional function-level comparison for targeted analysis
93
+
94
+ Args:
95
+ file_path_a: Path to the first binary file (e.g., pre-patch version)
96
+ file_path_b: Path to the second binary file (e.g., post-patch version)
97
+ function_name: Optional function name to compare (e.g., "main", "sym.decrypt").
98
+ If None, performs whole-binary comparison.
99
+ max_output_size: Maximum output size in bytes (default: 10MB)
100
+ timeout: Timeout in seconds (default: 300s)
101
+
102
+ Returns:
103
+ ToolResult with structured JSON containing:
104
+ - similarity: Float between 0.0 and 1.0 indicating code similarity
105
+ - changes: List of detected changes with addresses and descriptions
106
+ - function_specific: Boolean indicating if function-level diff was performed
107
+
108
+ Example:
109
+ # Compare two versions of a patched binary
110
+ diff_binaries("/app/workspace/app_v1.0.exe", "/app/workspace/app_v1.1.exe")
111
+
112
+ # Compare specific function between versions
113
+ diff_binaries("/app/workspace/malware_old.exe", "/app/workspace/malware_new.exe", "main")
114
+
115
+ Output Format:
116
+ {
117
+ "similarity": 0.95,
118
+ "function_specific": false,
119
+ "changes": [
120
+ {
121
+ "address": "0x401050",
122
+ "type": "code_change",
123
+ "description": "Instruction changed from JNZ to JZ"
124
+ },
125
+ {
126
+ "address": "0x401080",
127
+ "type": "new_block",
128
+ "description": "Added security check"
129
+ }
130
+ ],
131
+ "total_changes": 2
132
+ }
133
+ """
134
+ # Validate both file paths
135
+ validated_path_a = validate_file_path(file_path_a)
136
+ validated_path_b = validate_file_path(file_path_b)
137
+
138
+ # Validate tool parameters
139
+ validate_tool_parameters(
140
+ "diff_binaries",
141
+ {
142
+ "function_name": function_name,
143
+ "max_output_size": max_output_size,
144
+ "timeout": timeout,
145
+ },
146
+ )
147
+
148
+ try:
149
+ # Build radiff2 command
150
+ # -s: similarity score
151
+ # -C: code comparison
152
+ # -g: graph diff (if function specified)
153
+
154
+ if function_name:
155
+ # Function-specific comparison using graph diff
156
+ cmd = [
157
+ "radiff2",
158
+ "-g",
159
+ function_name,
160
+ str(validated_path_a),
161
+ str(validated_path_b),
162
+ ]
163
+ else:
164
+ # Whole-binary comparison with similarity scoring
165
+ cmd = [
166
+ "radiff2",
167
+ "-C",
168
+ str(validated_path_a),
169
+ str(validated_path_b),
170
+ ]
171
+
172
+ output, bytes_read = await execute_subprocess_async(
173
+ cmd,
174
+ max_output_size=max_output_size,
175
+ timeout=timeout,
176
+ )
177
+
178
+ # Also get similarity score (format: "similarity: 0.95")
179
+ similarity_cmd = ["radiff2", "-s", str(validated_path_a), str(validated_path_b)]
180
+ similarity_output, _ = await execute_subprocess_async(
181
+ similarity_cmd,
182
+ max_output_size=1_000_000,
183
+ timeout=60,
184
+ )
185
+
186
+ # Parse similarity score (format: "similarity: 0.95")
187
+ similarity = 0.0
188
+ # OPTIMIZATION: Use pre-compiled pattern (faster)
189
+ similarity_match = _SIMILARITY_PATTERN.search(similarity_output)
190
+ if similarity_match:
191
+ similarity = float(similarity_match.group(1))
192
+
193
+ # Parse changes from output
194
+ changes = []
195
+
196
+ # Parse the diff output to extract meaningful changes
197
+ # radiff2 output varies, so we'll capture the raw output and structure it
198
+ lines = output.strip().split("\n")
199
+
200
+ for line in lines:
201
+ if not line.strip():
202
+ continue
203
+
204
+ # Look for common patterns in radiff2 output
205
+ # OPTIMIZATION: Use pre-compiled pattern (faster)
206
+ addr_match = _ADDRESS_PATTERN.search(line)
207
+
208
+ if addr_match:
209
+ address = addr_match.group(1)
210
+
211
+ # Determine change type based on line content
212
+ change_type = "unknown"
213
+ description = line.strip()
214
+
215
+ if "new" in line.lower():
216
+ change_type = "new_block"
217
+ elif "removed" in line.lower() or "deleted" in line.lower():
218
+ change_type = "removed_block"
219
+ elif "modified" in line.lower() or "changed" in line.lower():
220
+ change_type = "code_change"
221
+ elif "jmp" in line.lower() or "call" in line.lower() or "jnz" in line.lower():
222
+ change_type = "control_flow_change"
223
+
224
+ changes.append(
225
+ {
226
+ "address": address,
227
+ "type": change_type,
228
+ "description": description,
229
+ }
230
+ )
231
+
232
+ # If no structured changes found, include summary info
233
+ if not changes and output.strip():
234
+ changes.append(
235
+ {
236
+ "type": "summary",
237
+ "description": "Binary comparison completed. See raw output for details.",
238
+ }
239
+ )
240
+
241
+ # Build result
242
+ result_data = {
243
+ "similarity": similarity,
244
+ "function_specific": bool(function_name),
245
+ "changes": changes,
246
+ "total_changes": len(changes),
247
+ "raw_output": (output if len(output) < 5000 else output[:5000] + "... (truncated)"),
248
+ }
249
+
250
+ return success(
251
+ json.dumps(result_data, indent=2),
252
+ bytes_read=bytes_read,
253
+ similarity=similarity,
254
+ total_changes=len(changes),
255
+ function_specific=bool(function_name),
256
+ )
257
+
258
+ except Exception as e:
259
+ return failure(
260
+ "DIFF_ERROR",
261
+ f"Binary diff failed: {str(e)}",
262
+ hint="Ensure both files are valid binaries and radiff2 is available. For function-level diff, verify function name exists in both binaries.",
263
+ )
264
+
265
+
266
+ @log_execution(tool_name="analyze_variant_changes")
267
+ @track_metrics("analyze_variant_changes")
268
+ @handle_tool_errors
269
+ async def analyze_variant_changes(
270
+ file_path_a: str,
271
+ file_path_b: str,
272
+ top_n: int = 3,
273
+ timeout: int = DEFAULT_TIMEOUT,
274
+ ) -> ToolResult:
275
+ """
276
+ Analyze structural changes between two binary variants (Lineage Mapper).
277
+
278
+ This tool combines binary diffing with control flow analysis to understand
279
+ *how* a binary has evolved. It identifies the most modified functions and
280
+ generates their Control Flow Graphs (CFG) for comparison.
281
+
282
+ **Use Cases:**
283
+ - **Malware Lineage**: "How did Lazarus Group modify their backdoor?"
284
+ - **Patch Diffing**: "What logic changed in the vulnerable function?"
285
+ - **Variant Analysis**: "Is this a new version of the same malware?"
286
+
287
+ Args:
288
+ file_path_a: Path to the original binary
289
+ file_path_b: Path to the variant binary
290
+ top_n: Number of top changed functions to analyze in detail (default: 3)
291
+ timeout: Execution timeout in seconds
292
+
293
+ Returns:
294
+ ToolResult with diff summary and CFG data for top changed functions.
295
+ """
296
+ # Import here to avoid circular dependency
297
+ from reversecore_mcp.tools.radare2.r2_analysis import generate_function_graph
298
+
299
+ # 1. Run diff_binaries
300
+ diff_result = await diff_binaries(file_path_a, file_path_b, timeout=timeout)
301
+
302
+ if diff_result.status == "error":
303
+ return diff_result
304
+
305
+ diff_data = (
306
+ json.loads(diff_result.data) if isinstance(diff_result.data, str) else diff_result.data
307
+ )
308
+ changes = diff_data.get("changes", [])
309
+
310
+ # 2. Identify changed functions (heuristic: group changes by address proximity or use explicit function diff if available)
311
+ # Since diff_binaries returns a flat list of changes, we'll try to map them to functions.
312
+ # For this advanced tool, we'll assume we want to analyze the functions where changes occurred.
313
+
314
+ # Get function list for file B (variant) to map addresses to names
315
+ # We use a simple r2 command to get functions
316
+ validated_path_b = validate_file_path(file_path_b)
317
+ cmd = _build_r2_cmd(str(validated_path_b), ["aflj"], "aaa")
318
+ out, _ = await execute_subprocess_async(cmd, timeout=60)
319
+
320
+ try:
321
+ funcs_b = _parse_json_output(out)
322
+ except (json.JSONDecodeError, TypeError):
323
+ funcs_b = []
324
+
325
+ # OPTIMIZATION: Pre-sort functions by offset for binary search
326
+ # This reduces O(n*m) to O(n*log(m)) complexity
327
+ # Further optimized to minimize redundant dict.get() calls
328
+ sorted_funcs = []
329
+ for f in funcs_b:
330
+ offset = f.get("offset")
331
+ size = f.get("size")
332
+ name = f.get("name", "unknown")
333
+ if offset is not None and size is not None:
334
+ sorted_funcs.append((offset, offset + size, name))
335
+ sorted_funcs.sort(key=lambda x: x[0])
336
+
337
+ # Map changes to functions using binary search
338
+ changed_funcs = {} # {func_name: count}
339
+
340
+ for change in changes:
341
+ addr_str = change.get("address")
342
+ if not addr_str:
343
+ continue
344
+ try:
345
+ addr = int(addr_str, 16)
346
+ # Binary search to find the function containing this address
347
+ left, right = 0, len(sorted_funcs) - 1
348
+ found_func = None
349
+
350
+ while left <= right:
351
+ mid = (left + right) // 2
352
+ func_start, func_end, func_name = sorted_funcs[mid]
353
+
354
+ if func_start <= addr < func_end:
355
+ found_func = func_name
356
+ break
357
+ elif addr < func_start:
358
+ right = mid - 1
359
+ else:
360
+ left = mid + 1
361
+
362
+ if found_func:
363
+ changed_funcs[found_func] = changed_funcs.get(found_func, 0) + 1
364
+ except ValueError:
365
+ # Invalid hex address format
366
+ pass
367
+
368
+ # Sort by number of changes
369
+ sorted_funcs = sorted(changed_funcs.items(), key=lambda x: x[1], reverse=True)[:top_n]
370
+
371
+ detailed_analysis = []
372
+
373
+ # 3. Generate CFG for top changed functions
374
+ for func_name, count in sorted_funcs:
375
+ # Get CFG for variant
376
+ cfg_result = await generate_function_graph(file_path_b, func_name, format="mermaid")
377
+ cfg_mermaid = cfg_result.data if cfg_result.status == "success" else "Error generating CFG"
378
+
379
+ detailed_analysis.append(
380
+ {
381
+ "function": func_name,
382
+ "change_count": count,
383
+ "cfg_mermaid": cfg_mermaid,
384
+ "analysis_hint": f"Function {func_name} has {count} modifications. Compare its logic with the original.",
385
+ }
386
+ )
387
+
388
+ return success(
389
+ {
390
+ "similarity": diff_data.get("similarity"),
391
+ "total_changes": diff_data.get("total_changes"),
392
+ "top_modified_functions": detailed_analysis,
393
+ },
394
+ description=f"Analyzed variants. Similarity: {diff_data.get('similarity')}. Detailed analysis for {len(detailed_analysis)} functions.",
395
+ )
396
+
397
+
398
+ @log_execution(tool_name="match_libraries")
399
+ @track_metrics("match_libraries")
400
+ @handle_tool_errors
401
+ async def match_libraries(
402
+ file_path: str,
403
+ signature_db: str = None,
404
+ max_output_size: int = 10_000_000,
405
+ timeout: int = 600,
406
+ ctx: Context = None,
407
+ ) -> ToolResult:
408
+ """
409
+ Match and filter known library functions to focus on user code.
410
+
411
+ This tool uses radare2's zignatures (FLIRT-compatible signature matching) to:
412
+ - **Reduce Analysis Noise**: Skip analysis of known library functions (strcpy, malloc, etc.)
413
+ - **Focus on User Code**: Identify which functions are original vs library code
414
+ - **Save Time & Tokens**: Reduce analysis scope by 80% by filtering out standard libraries
415
+ - **Improve Accuracy**: Focus AI analysis on the actual malicious/interesting code
416
+
417
+ Common use cases:
418
+ - Analyzing large binaries (>25MB) where most code is OpenSSL, zlib, MFC, etc.
419
+ - Game client reverse engineering (filter out Unreal Engine / Unity standard library)
420
+ - Malware analysis (focus on custom malware code, skip Windows API wrappers)
421
+
422
+ The tool automatically uses built-in signature databases for common libraries
423
+ and can optionally use custom signature databases for specialized analysis.
424
+
425
+ Args:
426
+ file_path: Path to the binary file to analyze
427
+ signature_db: Optional path to custom signature database file (.sig format).
428
+ If None, uses radare2's built-in signature databases.
429
+ max_output_size: Maximum output size in bytes (default: 10MB)
430
+ timeout: Timeout in seconds (default: 600s)
431
+ ctx: FastMCP Context (auto-injected)
432
+
433
+ Returns:
434
+ ToolResult with structured JSON containing:
435
+ - total_functions: Total number of functions found
436
+ - library_functions: Number of matched library functions
437
+ - user_functions: Number of unmatched (user) functions to analyze
438
+ - library_matches: List of matched library functions with details
439
+ - user_function_list: List of user function addresses/names for further analysis
440
+ - noise_reduction_percentage: Percentage of functions filtered out
441
+
442
+ Example:
443
+ # Auto-detect standard libraries
444
+ match_libraries("/app/workspace/large_app.exe")
445
+
446
+ # Use custom signature database
447
+ match_libraries("/app/workspace/game.exe", "/app/rules/game_engine.sig")
448
+
449
+ Output Format:
450
+ {
451
+ "total_functions": 1250,
452
+ "library_functions": 1000,
453
+ "user_functions": 250,
454
+ "noise_reduction_percentage": 80.0,
455
+ "library_matches": [
456
+ {
457
+ "address": "0x401000",
458
+ "name": "strcpy",
459
+ "library": "msvcrt"
460
+ },
461
+ {
462
+ "address": "0x401050",
463
+ "name": "malloc",
464
+ "library": "msvcrt"
465
+ }
466
+ ],
467
+ "user_function_list": [
468
+ "0x402000",
469
+ "0x402100",
470
+ "sym.custom_decrypt"
471
+ ]
472
+ }
473
+ """
474
+ # Validate file path
475
+ validated_path = validate_file_path(file_path)
476
+
477
+ # Validate optional signature database path
478
+ if signature_db:
479
+ validated_sig_path = validate_file_path(signature_db)
480
+
481
+ # Validate tool parameters
482
+ validate_tool_parameters(
483
+ "match_libraries",
484
+ {
485
+ "max_output_size": max_output_size,
486
+ "timeout": timeout,
487
+ },
488
+ )
489
+
490
+ try:
491
+ # Step 1: Determine analysis level based on file size
492
+ analysis_level = "aa" # Default to basic analysis
493
+ try:
494
+ file_size_mb = os.path.getsize(validated_path) / (1024 * 1024)
495
+ if file_size_mb < 10: # Full analysis for files under 10MB
496
+ analysis_level = "aaa"
497
+ if ctx:
498
+ await ctx.info(
499
+ f"File size: {file_size_mb:.1f}MB, using '{analysis_level}' analysis..."
500
+ )
501
+ except OSError:
502
+ pass
503
+
504
+ # Step 2: Build commands for signature matching
505
+ # Use radare2 to get function list with signature matching
506
+ if signature_db:
507
+ # Load custom signature database
508
+ r2_commands = [f"zg {validated_sig_path}", "aflj"]
509
+ else:
510
+ # Use built-in signatures
511
+ r2_commands = ["zg", "aflj"]
512
+
513
+ if ctx:
514
+ await ctx.report_progress(10, 100)
515
+ await ctx.info("Analyzing binary and matching signatures (this may take a while)...")
516
+
517
+ # Step 3: Execute radare2 command
518
+ output, bytes_read = await _execute_r2_command(
519
+ validated_path,
520
+ r2_commands,
521
+ analysis_level=analysis_level,
522
+ max_output_size=max_output_size,
523
+ base_timeout=timeout,
524
+ )
525
+
526
+ if ctx:
527
+ await ctx.report_progress(60, 100)
528
+ await ctx.info("Parsing function list...")
529
+
530
+ # Parse JSON output from aflj (function list JSON)
531
+ try:
532
+ # Attempt to find JSON array in output if direct parse fails
533
+ # This handles cases where 'zg' or 'aaa' might produce non-JSON output before the JSON result
534
+ functions = _parse_json_output(output)
535
+ except json.JSONDecodeError:
536
+ # If JSON parsing fails, fall back to text parsing
537
+ return failure(
538
+ "PARSE_ERROR",
539
+ "Failed to parse function list from radare2",
540
+ hint="The binary may not be analyzable or may be packed/obfuscated. Try running 'aaa' analysis first.",
541
+ )
542
+
543
+ # Categorize functions into library vs user code
544
+ library_functions = []
545
+ user_functions = []
546
+
547
+ total_functions = len(functions)
548
+ for idx, func in enumerate(functions):
549
+ # Report progress
550
+ if ctx and idx % 10 == 0: # Report every 10 functions to avoid spam
551
+ await ctx.report_progress(idx, total_functions)
552
+
553
+ name = func.get("name", "")
554
+ # Support both 'offset' (aflj) and 'vaddr' (isj) keys
555
+ # Fallback to 'realname' or other identifiers if needed
556
+ offset = func.get("offset", func.get("vaddr", 0))
557
+
558
+ # If offset is 0, try to parse it from the name if it looks like sym.func.0x...
559
+ if offset == 0 and name:
560
+ # OPTIMIZATION: Use pre-compiled pattern (faster)
561
+ hex_match = _HEX_PATTERN.search(name)
562
+ if hex_match:
563
+ try:
564
+ offset = int(hex_match.group(1), 16)
565
+ except ValueError:
566
+ pass
567
+
568
+ # Heuristic: library functions typically have names like:
569
+ # - sym.imp.* (imports)
570
+ # - sym.std::* (C++ standard library)
571
+ # - Known library prefixes
572
+ is_library = (
573
+ name.startswith("sym.imp.")
574
+ or name.startswith("sym.std::")
575
+ or name.startswith("fcn.imp.")
576
+ or "libc" in name.lower()
577
+ or "msvcrt" in name.lower()
578
+ or "kernel32" in name.lower()
579
+ )
580
+
581
+ if is_library:
582
+ library_functions.append(
583
+ {
584
+ "address": f"0x{offset:x}",
585
+ "name": name,
586
+ "library": _extract_library_name(name),
587
+ }
588
+ )
589
+ else:
590
+ user_functions.append({"address": f"0x{offset:x}", "name": name})
591
+
592
+ # Final progress report
593
+ if ctx:
594
+ await ctx.report_progress(total_functions, total_functions)
595
+
596
+ total_functions = len(functions)
597
+ library_count = len(library_functions)
598
+ user_count = len(user_functions)
599
+
600
+ # Calculate noise reduction percentage
601
+ noise_reduction = (library_count / total_functions * 100) if total_functions > 0 else 0.0
602
+
603
+ # Build result
604
+ result_data = {
605
+ "total_functions": total_functions,
606
+ "library_functions": library_count,
607
+ "user_functions": user_count,
608
+ "noise_reduction_percentage": round(noise_reduction, 2),
609
+ "library_matches": library_functions[:50], # Limit to first 50 for readability
610
+ "user_function_list": [
611
+ f["address"] for f in user_functions[:100]
612
+ ], # First 100 user functions
613
+ "summary": f"Filtered out {library_count} library functions ({noise_reduction:.1f}% noise reduction). Focus analysis on {user_count} user functions.",
614
+ "signature_db_used": signature_db if signature_db else "built-in",
615
+ }
616
+
617
+ if library_count == 0:
618
+ result_data["hint"] = (
619
+ "No library functions matched. This could mean: "
620
+ "1. No signatures loaded (check signature_db). "
621
+ "2. Binary uses statically linked libraries not in DB. "
622
+ "3. Binary is fully custom."
623
+ )
624
+
625
+ return success(
626
+ json.dumps(result_data, indent=2),
627
+ bytes_read=bytes_read,
628
+ total_functions=total_functions,
629
+ library_functions=library_count,
630
+ user_functions=user_count,
631
+ noise_reduction=round(noise_reduction, 2),
632
+ )
633
+
634
+ except Exception as e:
635
+ return failure(
636
+ "LIBRARY_MATCH_ERROR",
637
+ f"Library signature matching failed: {str(e)}",
638
+ hint="Ensure the binary is valid and radare2 signature databases are available. For custom databases, verify the .sig file format.",
639
+ )
640
+
641
+
642
+ # Note: DiffToolsPlugin has been removed.
643
+ # The diff tools are now registered via AnalysisToolsPlugin in analysis/__init__.py.