claude-mpm 5.1.9__py3-none-any.whl → 5.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of claude-mpm might be problematic. Click here for more details.

Files changed (131) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/PM_INSTRUCTIONS.md +46 -0
  3. claude_mpm/agents/agent_loader.py +10 -17
  4. claude_mpm/agents/templates/circuit-breakers.md +138 -1
  5. claude_mpm/cli/commands/agent_state_manager.py +8 -17
  6. claude_mpm/cli/commands/configure.py +1046 -149
  7. claude_mpm/cli/commands/configure_agent_display.py +13 -6
  8. claude_mpm/cli/commands/mpm_init/core.py +158 -1
  9. claude_mpm/cli/commands/mpm_init/knowledge_extractor.py +481 -0
  10. claude_mpm/cli/commands/mpm_init/prompts.py +280 -0
  11. claude_mpm/cli/commands/summarize.py +413 -0
  12. claude_mpm/cli/executor.py +8 -0
  13. claude_mpm/cli/parsers/base_parser.py +5 -0
  14. claude_mpm/cli/startup.py +60 -53
  15. claude_mpm/commands/{mpm-ticket-organize.md → mpm-organize.md} +4 -5
  16. claude_mpm/config/agent_sources.py +27 -0
  17. claude_mpm/core/framework/loaders/agent_loader.py +8 -5
  18. claude_mpm/core/socketio_pool.py +3 -3
  19. claude_mpm/core/unified_agent_registry.py +5 -15
  20. claude_mpm/hooks/claude_hooks/__pycache__/__init__.cpython-313.pyc +0 -0
  21. claude_mpm/hooks/claude_hooks/__pycache__/correlation_manager.cpython-313.pyc +0 -0
  22. claude_mpm/hooks/claude_hooks/__pycache__/event_handlers.cpython-313.pyc +0 -0
  23. claude_mpm/hooks/claude_hooks/__pycache__/hook_handler.cpython-313.pyc +0 -0
  24. claude_mpm/hooks/claude_hooks/__pycache__/memory_integration.cpython-313.pyc +0 -0
  25. claude_mpm/hooks/claude_hooks/__pycache__/response_tracking.cpython-313.pyc +0 -0
  26. claude_mpm/hooks/claude_hooks/__pycache__/tool_analysis.cpython-313.pyc +0 -0
  27. claude_mpm/hooks/claude_hooks/correlation_manager.py +60 -0
  28. claude_mpm/hooks/claude_hooks/event_handlers.py +35 -2
  29. claude_mpm/hooks/claude_hooks/hook_handler.py +4 -0
  30. claude_mpm/hooks/claude_hooks/services/__pycache__/__init__.cpython-313.pyc +0 -0
  31. claude_mpm/hooks/claude_hooks/services/__pycache__/connection_manager_http.cpython-313.pyc +0 -0
  32. claude_mpm/hooks/claude_hooks/services/__pycache__/duplicate_detector.cpython-313.pyc +0 -0
  33. claude_mpm/hooks/claude_hooks/services/__pycache__/state_manager.cpython-313.pyc +0 -0
  34. claude_mpm/hooks/claude_hooks/services/__pycache__/subagent_processor.cpython-313.pyc +0 -0
  35. claude_mpm/hooks/claude_hooks/services/connection_manager.py +4 -0
  36. claude_mpm/scripts/launch_monitor.py +93 -13
  37. claude_mpm/services/agents/agent_recommendation_service.py +279 -0
  38. claude_mpm/services/agents/deployment/agent_template_builder.py +3 -2
  39. claude_mpm/services/agents/deployment/remote_agent_discovery_service.py +322 -53
  40. claude_mpm/services/agents/git_source_manager.py +20 -0
  41. claude_mpm/services/agents/sources/git_source_sync_service.py +8 -1
  42. claude_mpm/services/agents/toolchain_detector.py +6 -5
  43. claude_mpm/services/analysis/__init__.py +11 -1
  44. claude_mpm/services/analysis/clone_detector.py +1030 -0
  45. claude_mpm/services/command_deployment_service.py +0 -2
  46. claude_mpm/services/event_bus/config.py +3 -1
  47. claude_mpm/services/monitor/daemon.py +9 -2
  48. claude_mpm/services/monitor/daemon_manager.py +39 -3
  49. claude_mpm/services/monitor/server.py +225 -19
  50. claude_mpm/services/socketio/event_normalizer.py +15 -1
  51. claude_mpm/services/socketio/server/core.py +160 -21
  52. claude_mpm/services/version_control/git_operations.py +103 -0
  53. claude_mpm/utils/agent_filters.py +17 -44
  54. {claude_mpm-5.1.9.dist-info → claude_mpm-5.4.3.dist-info}/METADATA +1 -77
  55. {claude_mpm-5.1.9.dist-info → claude_mpm-5.4.3.dist-info}/RECORD +59 -114
  56. {claude_mpm-5.1.9.dist-info → claude_mpm-5.4.3.dist-info}/entry_points.txt +0 -2
  57. claude_mpm/dashboard/analysis_runner.py +0 -455
  58. claude_mpm/dashboard/index.html +0 -13
  59. claude_mpm/dashboard/open_dashboard.py +0 -66
  60. claude_mpm/dashboard/static/css/activity.css +0 -1958
  61. claude_mpm/dashboard/static/css/connection-status.css +0 -370
  62. claude_mpm/dashboard/static/css/dashboard.css +0 -4701
  63. claude_mpm/dashboard/static/js/components/activity-tree.js +0 -1871
  64. claude_mpm/dashboard/static/js/components/agent-hierarchy.js +0 -777
  65. claude_mpm/dashboard/static/js/components/agent-inference.js +0 -956
  66. claude_mpm/dashboard/static/js/components/build-tracker.js +0 -333
  67. claude_mpm/dashboard/static/js/components/code-simple.js +0 -857
  68. claude_mpm/dashboard/static/js/components/connection-debug.js +0 -654
  69. claude_mpm/dashboard/static/js/components/diff-viewer.js +0 -891
  70. claude_mpm/dashboard/static/js/components/event-processor.js +0 -542
  71. claude_mpm/dashboard/static/js/components/event-viewer.js +0 -1155
  72. claude_mpm/dashboard/static/js/components/export-manager.js +0 -368
  73. claude_mpm/dashboard/static/js/components/file-change-tracker.js +0 -443
  74. claude_mpm/dashboard/static/js/components/file-change-viewer.js +0 -690
  75. claude_mpm/dashboard/static/js/components/file-tool-tracker.js +0 -724
  76. claude_mpm/dashboard/static/js/components/file-viewer.js +0 -580
  77. claude_mpm/dashboard/static/js/components/hud-library-loader.js +0 -211
  78. claude_mpm/dashboard/static/js/components/hud-manager.js +0 -671
  79. claude_mpm/dashboard/static/js/components/hud-visualizer.js +0 -1718
  80. claude_mpm/dashboard/static/js/components/module-viewer.js +0 -2764
  81. claude_mpm/dashboard/static/js/components/session-manager.js +0 -579
  82. claude_mpm/dashboard/static/js/components/socket-manager.js +0 -368
  83. claude_mpm/dashboard/static/js/components/ui-state-manager.js +0 -749
  84. claude_mpm/dashboard/static/js/components/unified-data-viewer.js +0 -1824
  85. claude_mpm/dashboard/static/js/components/working-directory.js +0 -920
  86. claude_mpm/dashboard/static/js/connection-manager.js +0 -536
  87. claude_mpm/dashboard/static/js/dashboard.js +0 -1914
  88. claude_mpm/dashboard/static/js/extension-error-handler.js +0 -164
  89. claude_mpm/dashboard/static/js/socket-client.js +0 -1474
  90. claude_mpm/dashboard/static/js/tab-isolation-fix.js +0 -185
  91. claude_mpm/dashboard/static/socket.io.min.js +0 -7
  92. claude_mpm/dashboard/static/socket.io.v4.8.1.backup.js +0 -7
  93. claude_mpm/dashboard/templates/code_simple.html +0 -153
  94. claude_mpm/dashboard/templates/index.html +0 -606
  95. claude_mpm/dashboard/test_dashboard.html +0 -372
  96. claude_mpm/scripts/mcp_server.py +0 -75
  97. claude_mpm/scripts/mcp_wrapper.py +0 -39
  98. claude_mpm/services/mcp_gateway/__init__.py +0 -159
  99. claude_mpm/services/mcp_gateway/auto_configure.py +0 -369
  100. claude_mpm/services/mcp_gateway/config/__init__.py +0 -17
  101. claude_mpm/services/mcp_gateway/config/config_loader.py +0 -296
  102. claude_mpm/services/mcp_gateway/config/config_schema.py +0 -243
  103. claude_mpm/services/mcp_gateway/config/configuration.py +0 -429
  104. claude_mpm/services/mcp_gateway/core/__init__.py +0 -43
  105. claude_mpm/services/mcp_gateway/core/base.py +0 -312
  106. claude_mpm/services/mcp_gateway/core/exceptions.py +0 -253
  107. claude_mpm/services/mcp_gateway/core/interfaces.py +0 -443
  108. claude_mpm/services/mcp_gateway/core/process_pool.py +0 -977
  109. claude_mpm/services/mcp_gateway/core/singleton_manager.py +0 -315
  110. claude_mpm/services/mcp_gateway/core/startup_verification.py +0 -316
  111. claude_mpm/services/mcp_gateway/main.py +0 -589
  112. claude_mpm/services/mcp_gateway/registry/__init__.py +0 -12
  113. claude_mpm/services/mcp_gateway/registry/service_registry.py +0 -412
  114. claude_mpm/services/mcp_gateway/registry/tool_registry.py +0 -489
  115. claude_mpm/services/mcp_gateway/server/__init__.py +0 -15
  116. claude_mpm/services/mcp_gateway/server/mcp_gateway.py +0 -414
  117. claude_mpm/services/mcp_gateway/server/stdio_handler.py +0 -372
  118. claude_mpm/services/mcp_gateway/server/stdio_server.py +0 -712
  119. claude_mpm/services/mcp_gateway/tools/__init__.py +0 -36
  120. claude_mpm/services/mcp_gateway/tools/base_adapter.py +0 -485
  121. claude_mpm/services/mcp_gateway/tools/document_summarizer.py +0 -789
  122. claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +0 -654
  123. claude_mpm/services/mcp_gateway/tools/health_check_tool.py +0 -456
  124. claude_mpm/services/mcp_gateway/tools/hello_world.py +0 -551
  125. claude_mpm/services/mcp_gateway/tools/kuzu_memory_service.py +0 -555
  126. claude_mpm/services/mcp_gateway/utils/__init__.py +0 -14
  127. claude_mpm/services/mcp_gateway/utils/package_version_checker.py +0 -160
  128. claude_mpm/services/mcp_gateway/utils/update_preferences.py +0 -170
  129. {claude_mpm-5.1.9.dist-info → claude_mpm-5.4.3.dist-info}/WHEEL +0 -0
  130. {claude_mpm-5.1.9.dist-info → claude_mpm-5.4.3.dist-info}/licenses/LICENSE +0 -0
  131. {claude_mpm-5.1.9.dist-info → claude_mpm-5.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1030 @@
1
+ """Code clone detection service using AST-based similarity analysis.
2
+
3
+ This module provides functionality to detect code clones (duplicated or similar code)
4
+ across Python codebases and suggest refactoring opportunities.
5
+
6
+ Extended to support multi-language clone detection using tree-sitter for:
7
+ JavaScript, TypeScript, Go, Rust, Java, Ruby, PHP, C, C++
8
+ """
9
+
10
+ import ast
11
+ import difflib
12
+ import importlib.util
13
+ import logging
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+ from typing import Any, ClassVar
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Optional dependency: pylint provides the Symilar duplicate-code engine used
# for Python clone detection. find_spec() only shows that the package can be
# located, so the real import is still guarded.
PYLINT_AVAILABLE = importlib.util.find_spec("pylint") is not None
Symilar: Any = None
if PYLINT_AVAILABLE:
    try:
        from pylint.checkers.symilar import Symilar
    except ImportError:
        PYLINT_AVAILABLE = False

# Optional dependency: tree-sitter provides parsers for the non-Python
# languages. The import is guarded the same way as pylint above so that a
# broken or partial installation disables multi-language support instead of
# making this module fail to import.
TREE_SITTER_AVAILABLE = importlib.util.find_spec("tree_sitter") is not None
if TREE_SITTER_AVAILABLE:
    try:
        import tree_sitter  # type: ignore[import-not-found]
    except ImportError:
        TREE_SITTER_AVAILABLE = False
33
+
34
+
35
@dataclass
class CloneReport:
    """A single detected clone: two similar code regions and their score.

    Attributes:
        file1: File holding the first region
        file2: File holding the second region
        line_start1: First line of the region in file1
        line_end1: Last line of the region in file1
        line_start2: First line of the region in file2
        line_end2: Last line of the region in file2
        similarity: Score in the closed range 0.0 to 1.0
        clone_type: One of "exact", "renamed" or "modified"
        code_snippet1: Source text of the region in file1
        code_snippet2: Source text of the region in file2
    """

    file1: Path
    file2: Path
    line_start1: int
    line_end1: int
    line_start2: int
    line_end2: int
    similarity: float
    clone_type: str
    code_snippet1: str
    code_snippet2: str

    def __post_init__(self) -> None:
        """Reject an out-of-range similarity or an unknown clone type."""
        score_in_range = 0.0 <= self.similarity <= 1.0
        if not score_in_range:
            raise ValueError(
                f"Similarity must be between 0.0 and 1.0, got {self.similarity}"
            )

        known_types = ("exact", "renamed", "modified")
        if self.clone_type in known_types:
            return
        raise ValueError(
            f"Clone type must be 'exact', 'renamed', or 'modified', got {self.clone_type}"
        )
73
+
74
+
75
@dataclass
class RefactoringSuggestion:
    """A proposed refactoring for a group of detected clones.

    Attributes:
        description: Plain-language summary of the proposed change
        affected_files: Files the refactoring would touch
        estimated_reduction: Approximate number of lines the change would save
        suggested_function_name: Proposed name for the extracted function
        parameters: Parameter names of the extracted function
        code_template: Example code illustrating the refactoring
    """

    description: str
    affected_files: list[Path]
    estimated_reduction: int
    suggested_function_name: str
    parameters: list[str]
    code_template: str
94
+
95
+
96
@dataclass
class SimilarityReport:
    """Function-level similarity between two files.

    Attributes:
        file1: Path of the first file
        file2: Path of the second file
        similar_functions: Tuples of (func1_name, func2_name, similarity_score)
        overall_similarity: File-level similarity score (0.0 to 1.0)
    """

    file1: Path
    file2: Path
    # default_factory gives every report its own list instead of a shared one.
    similar_functions: list[tuple[str, str, float]] = field(default_factory=list)
    overall_similarity: float = 0.0
111
+
112
+
113
+ class CloneDetector:
114
+ """AST-based code clone detector using pycode_similar and pylint.
115
+
116
+ This class provides methods to detect code clones, analyze similarity between
117
+ functions, and suggest refactoring opportunities to reduce code duplication.
118
+
119
+ Features:
120
+ - Exact clone detection (Type-1): Identical code blocks
121
+ - Renamed clone detection (Type-2): Same structure, different identifiers
122
+ - Modified clone detection (Type-3): Similar logic with minor changes
123
+ - Multi-language support: Python, JavaScript, TypeScript, Go, Rust, Java, Ruby, PHP, C, C++
124
+ """
125
+
126
+ # Similarity thresholds for clone classification
127
+ EXACT_THRESHOLD: ClassVar[float] = 0.95
128
+ RENAMED_THRESHOLD: ClassVar[float] = 0.80
129
+ MODIFIED_THRESHOLD: ClassVar[float] = 0.60
130
+
131
+ # Minimum lines for clone detection
132
+ MIN_CLONE_LINES: ClassVar[int] = 4
133
+
134
+ # Language extension mapping
135
+ LANGUAGE_EXTENSIONS: ClassVar[dict[str, list[str]]] = {
136
+ "python": [".py"],
137
+ "javascript": [".js", ".jsx", ".mjs"],
138
+ "typescript": [".ts", ".tsx"],
139
+ "go": [".go"],
140
+ "rust": [".rs"],
141
+ "java": [".java"],
142
+ "ruby": [".rb"],
143
+ "php": [".php"],
144
+ "c": [".c", ".h"],
145
+ "cpp": [".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx"],
146
+ }
147
+
148
+ # Tree-sitter language module names
149
+ TREE_SITTER_LANGUAGES: ClassVar[dict[str, str]] = {
150
+ "python": "tree_sitter_python",
151
+ "javascript": "tree_sitter_javascript",
152
+ "typescript": "tree_sitter_typescript",
153
+ "go": "tree_sitter_go",
154
+ "rust": "tree_sitter_rust",
155
+ "java": "tree_sitter_java",
156
+ "ruby": "tree_sitter_ruby",
157
+ "php": "tree_sitter_php",
158
+ "c": "tree_sitter_c",
159
+ "cpp": "tree_sitter_cpp",
160
+ }
161
+
162
+ def __init__(self, min_similarity: float = 0.60, min_lines: int = 4) -> None:
163
+ """Initialize clone detector.
164
+
165
+ Args:
166
+ min_similarity: Minimum similarity threshold (0.0 to 1.0)
167
+ min_lines: Minimum number of lines to consider for clones
168
+ """
169
+ if not 0.0 <= min_similarity <= 1.0:
170
+ raise ValueError(
171
+ f"min_similarity must be between 0.0 and 1.0, got {min_similarity}"
172
+ )
173
+ if min_lines < 1:
174
+ raise ValueError(f"min_lines must be >= 1, got {min_lines}")
175
+
176
+ self.min_similarity = min_similarity
177
+ self.min_lines = min_lines
178
+ self._parsers: dict[str, Any] = {}
179
+ self._init_tree_sitter_parsers()
180
+
181
def _init_tree_sitter_parsers(self) -> None:
    """Fill ``self._parsers`` with a tree-sitter parser per usable grammar.

    Does nothing when tree-sitter itself is unavailable. Each language in
    ``TREE_SITTER_LANGUAGES`` is tried independently; a grammar that is not
    installed, exposes no language factory, or cannot be loaded is skipped
    so that the remaining languages still work. A parser is only registered
    once a language has actually been attached to it.
    """
    if not TREE_SITTER_AVAILABLE:
        logger.debug("tree-sitter not available - multi-language support disabled")
        return

    for lang, module_name in self.TREE_SITTER_LANGUAGES.items():
        try:
            # Grammar package not installed: nothing to do for this language.
            if importlib.util.find_spec(module_name) is None:
                continue

            module = importlib.import_module(module_name)

            # Most grammar packages expose ``language()``. Packages that
            # bundle several grammars appear to name the factory after the
            # language instead (e.g. ``language_typescript``) - TODO confirm
            # against the installed grammar packages.
            factory = getattr(module, "language", None) or getattr(
                module, f"language_{lang}", None
            )
            if factory is None:
                logger.debug(
                    "Could not load parser for %s: %s exposes no language factory",
                    lang,
                    module_name,
                )
                continue

            lang_obj = tree_sitter.Language(factory())

            # Handle different tree-sitter API versions: older releases
            # attach the language with set_language(), newer ones take it
            # as a constructor argument.
            parser = tree_sitter.Parser()
            if hasattr(parser, "set_language"):
                parser.set_language(lang_obj)
            else:
                parser = tree_sitter.Parser(lang_obj)

        except (ImportError, AttributeError, TypeError, ValueError) as e:
            # TypeError/ValueError cover Language()/Parser() signature or
            # ABI mismatches between tree-sitter and the grammar package.
            logger.debug("Could not load parser for %s: %s", lang, e)
            continue

        self._parsers[lang] = parser
        logger.debug("Initialized tree-sitter parser for %s", lang)
215
+
216
+ def _detect_language(self, file_path: Path) -> str | None:
217
+ """Detect programming language from file extension.
218
+
219
+ Args:
220
+ file_path: Path to source file
221
+
222
+ Returns:
223
+ Language name or None if not supported
224
+ """
225
+ ext = file_path.suffix.lower()
226
+ for lang, extensions in self.LANGUAGE_EXTENSIONS.items():
227
+ if ext in extensions:
228
+ return lang
229
+ return None
230
+
231
+ def detect_clones(
232
+ self, project_path: Path, languages: list[str] | None = None
233
+ ) -> list[CloneReport]:
234
+ """Detect code clones in a project directory.
235
+
236
+ Supports multi-language detection using tree-sitter for non-Python languages
237
+ and pylint for Python files.
238
+
239
+ Args:
240
+ project_path: Root directory of project to analyze
241
+ languages: List of languages to analyze (None = all supported languages)
242
+
243
+ Returns:
244
+ List of CloneReport objects describing detected clones
245
+
246
+ Raises:
247
+ ValueError: If project_path doesn't exist or isn't a directory
248
+ """
249
+ if not project_path.exists():
250
+ raise ValueError(f"Project path does not exist: {project_path}")
251
+ if not project_path.is_dir():
252
+ raise ValueError(f"Project path is not a directory: {project_path}")
253
+
254
+ logger.info("Detecting clones in project: %s", project_path)
255
+
256
+ # Determine which languages to analyze
257
+ target_languages = (
258
+ languages if languages else list(self.LANGUAGE_EXTENSIONS.keys())
259
+ )
260
+
261
+ # Collect files by language
262
+ files_by_language: dict[str, list[Path]] = {
263
+ lang: [] for lang in target_languages
264
+ }
265
+
266
+ # Scan project for files
267
+ for lang in target_languages:
268
+ extensions = self.LANGUAGE_EXTENSIONS.get(lang, [])
269
+ for ext in extensions:
270
+ files_by_language[lang].extend(project_path.rglob(f"*{ext}"))
271
+
272
+ # Remove empty language groups
273
+ files_by_language = {
274
+ lang: files for lang, files in files_by_language.items() if files
275
+ }
276
+
277
+ if not files_by_language:
278
+ logger.warning("No supported files found in %s", project_path)
279
+ return []
280
+
281
+ total_files = sum(len(files) for files in files_by_language.values())
282
+ logger.info(
283
+ "Found %d files across %d languages", total_files, len(files_by_language)
284
+ )
285
+
286
+ # Detect clones per language
287
+ all_clones: list[CloneReport] = []
288
+
289
+ for lang, files in files_by_language.items():
290
+ logger.info("Analyzing %d %s files", len(files), lang)
291
+ try:
292
+ if lang == "python":
293
+ # Use pylint for Python
294
+ clones = self._detect_with_pylint(files)
295
+ else:
296
+ # Use tree-sitter for other languages
297
+ clones = self._detect_with_tree_sitter(files, lang)
298
+
299
+ all_clones.extend(clones)
300
+ logger.info("Found %d clones in %s files", len(clones), lang)
301
+
302
+ except Exception as e:
303
+ logger.error("Error detecting clones in %s files: %s", lang, e)
304
+
305
+ logger.info("Detected %d total clones", len(all_clones))
306
+ return all_clones
307
+
308
+ def _detect_with_pylint(self, files: list[Path]) -> list[CloneReport]:
309
+ """Detect clones using pylint's Similar checker.
310
+
311
+ Args:
312
+ files: List of Python files to analyze
313
+
314
+ Returns:
315
+ List of CloneReport objects
316
+ """
317
+ clones: list[CloneReport] = []
318
+
319
+ # Create Symilar instance with our minimum line threshold
320
+ similar = Symilar(
321
+ min_lines=self.min_lines,
322
+ ignore_comments=True,
323
+ ignore_docstrings=True,
324
+ ignore_imports=False,
325
+ )
326
+
327
+ # Process files
328
+ for file_path in files:
329
+ try:
330
+ with open(file_path, encoding="utf-8") as f:
331
+ similar.append_stream(str(file_path), f, file_path.name)
332
+ except Exception as e:
333
+ logger.warning("Error reading %s: %s", file_path, e)
334
+ continue
335
+
336
+ # Run similarity analysis
337
+ try:
338
+ similar.run()
339
+
340
+ # Extract clone information from Similar instance
341
+ # Similar stores results in linesets which we need to process
342
+ for duplicate in similar._compute_sims():
343
+ # Each duplicate is ((file1, start1, end1), (file2, start2, end2))
344
+ if len(duplicate) >= 2:
345
+ loc1, loc2 = duplicate[0], duplicate[1]
346
+ file1_path = Path(loc1[0])
347
+ file2_path = Path(loc2[0])
348
+
349
+ # Read code snippets
350
+ snippet1 = self._read_lines(file1_path, loc1[1], loc1[2])
351
+ snippet2 = self._read_lines(file2_path, loc2[1], loc2[2])
352
+
353
+ # Calculate similarity
354
+ similarity = self._calculate_similarity(snippet1, snippet2)
355
+
356
+ # Determine clone type
357
+ clone_type = self._classify_clone_type(similarity)
358
+
359
+ # Only include if meets minimum similarity threshold
360
+ if similarity >= self.min_similarity:
361
+ clone = CloneReport(
362
+ file1=file1_path,
363
+ file2=file2_path,
364
+ line_start1=loc1[1],
365
+ line_end1=loc1[2],
366
+ line_start2=loc2[1],
367
+ line_end2=loc2[2],
368
+ similarity=similarity,
369
+ clone_type=clone_type,
370
+ code_snippet1=snippet1,
371
+ code_snippet2=snippet2,
372
+ )
373
+ clones.append(clone)
374
+
375
+ except Exception as e:
376
+ logger.error("Error running similarity analysis: %s", e)
377
+
378
+ return clones
379
+
380
+ def _detect_with_tree_sitter(
381
+ self, files: list[Path], language: str
382
+ ) -> list[CloneReport]:
383
+ """Detect clones using tree-sitter for non-Python languages.
384
+
385
+ Args:
386
+ files: List of source files to analyze
387
+ language: Programming language
388
+
389
+ Returns:
390
+ List of CloneReport objects
391
+ """
392
+ if language not in self._parsers:
393
+ logger.warning("No parser available for %s", language)
394
+ return []
395
+
396
+ parser = self._parsers[language]
397
+ clones: list[CloneReport] = []
398
+
399
+ # Extract code blocks from all files
400
+ file_blocks: dict[Path, list[tuple[int, int, str, str]]] = {}
401
+
402
+ for file_path in files:
403
+ try:
404
+ blocks = self._extract_code_blocks(file_path, parser, language)
405
+ if blocks:
406
+ file_blocks[file_path] = blocks
407
+ except Exception as e:
408
+ logger.warning("Error extracting blocks from %s: %s", file_path, e)
409
+
410
+ # Compare all block pairs across files
411
+ file_paths = list(file_blocks.keys())
412
+ for i, file1 in enumerate(file_paths):
413
+ for file2 in file_paths[i + 1 :]:
414
+ clones.extend(
415
+ self._compare_file_blocks(
416
+ file1, file_blocks[file1], file2, file_blocks[file2]
417
+ )
418
+ )
419
+
420
+ return clones
421
+
422
+ def _extract_code_blocks(
423
+ self, file_path: Path, parser: Any, language: str
424
+ ) -> list[tuple[int, int, str, str]]:
425
+ """Extract code blocks from a file using tree-sitter.
426
+
427
+ Args:
428
+ file_path: Path to source file
429
+ parser: Tree-sitter parser for the language
430
+ language: Programming language
431
+
432
+ Returns:
433
+ List of tuples: (start_line, end_line, code_text, normalized_ast)
434
+ """
435
+ try:
436
+ with open(file_path, "rb") as f:
437
+ source = f.read()
438
+
439
+ tree = parser.parse(source)
440
+ blocks: list[tuple[int, int, str, str]] = []
441
+
442
+ # Extract function/method blocks
443
+ self._walk_tree_for_blocks(tree.root_node, source, language, blocks)
444
+
445
+ return blocks
446
+
447
+ except Exception as e:
448
+ logger.debug("Error parsing %s: %s", file_path, e)
449
+ return []
450
+
451
+ def _walk_tree_for_blocks(
452
+ self,
453
+ node: Any,
454
+ source: bytes,
455
+ language: str,
456
+ blocks: list[tuple[int, int, str, str]],
457
+ ) -> None:
458
+ """Recursively walk tree-sitter AST to extract code blocks.
459
+
460
+ Args:
461
+ node: Tree-sitter node
462
+ source: Source code bytes
463
+ language: Programming language
464
+ blocks: Output list to append blocks to
465
+ """
466
+ # Define function/method node types per language
467
+ function_types = {
468
+ "javascript": [
469
+ "function_declaration",
470
+ "arrow_function",
471
+ "method_definition",
472
+ ],
473
+ "typescript": [
474
+ "function_declaration",
475
+ "arrow_function",
476
+ "method_definition",
477
+ ],
478
+ "go": ["function_declaration", "method_declaration"],
479
+ "rust": ["function_item", "impl_item"],
480
+ "java": ["method_declaration", "constructor_declaration"],
481
+ "ruby": ["method", "singleton_method"],
482
+ "php": ["function_definition", "method_declaration"],
483
+ "c": ["function_definition"],
484
+ "cpp": ["function_definition"],
485
+ }
486
+
487
+ target_types = function_types.get(language, ["function_declaration"])
488
+
489
+ # Check if this node is a function/method
490
+ if node.type in target_types:
491
+ # Extract code block
492
+ start_line = node.start_point[0] + 1
493
+ end_line = node.end_point[0] + 1
494
+ line_count = end_line - start_line + 1
495
+
496
+ # Only consider blocks meeting minimum line threshold
497
+ if line_count >= self.min_lines:
498
+ code_text = source[node.start_byte : node.end_byte].decode(
499
+ "utf-8", errors="ignore"
500
+ )
501
+ normalized = self._normalize_ast(node, source, language)
502
+
503
+ blocks.append((start_line, end_line, code_text, normalized))
504
+
505
+ # Recursively process children
506
+ for child in node.children:
507
+ self._walk_tree_for_blocks(child, source, language, blocks)
508
+
509
+ def _normalize_ast(self, node: Any, source: bytes, language: str) -> str:
510
+ """Normalize AST to detect Type-2 clones (renamed identifiers).
511
+
512
+ Replaces variable names, function names with generic tokens to detect
513
+ structural similarity even when identifiers differ.
514
+
515
+ Args:
516
+ node: Tree-sitter node
517
+ source: Source code bytes
518
+ language: Programming language
519
+
520
+ Returns:
521
+ Normalized AST representation as string
522
+ """
523
+ # Build normalized representation by replacing identifiers
524
+ # This allows detecting clones where only variable names differ
525
+
526
+ def normalize_node(n: Any) -> str:
527
+ # Replace identifier nodes with generic token
528
+ if n.type == "identifier":
529
+ return "<ID>"
530
+ if n.type in ("string", "string_literal", "char_literal"):
531
+ return "<STR>"
532
+ if n.type in ("number", "integer", "float"):
533
+ return "<NUM>"
534
+ if n.type == "comment":
535
+ return "" # Ignore comments
536
+ if not n.children:
537
+ # Leaf node - use actual text
538
+ try:
539
+ return source[n.start_byte : n.end_byte].decode(
540
+ "utf-8", errors="ignore"
541
+ )
542
+ except Exception:
543
+ return n.type
544
+ else:
545
+ # Non-leaf - recursively normalize children
546
+ parts = [normalize_node(child) for child in n.children]
547
+ return f"({n.type} {' '.join(p for p in parts if p)})"
548
+
549
+ return normalize_node(node)
550
+
551
+ def _compare_file_blocks(
552
+ self,
553
+ file1: Path,
554
+ blocks1: list[tuple[int, int, str, str]],
555
+ file2: Path,
556
+ blocks2: list[tuple[int, int, str, str]],
557
+ ) -> list[CloneReport]:
558
+ """Compare code blocks between two files.
559
+
560
+ Args:
561
+ file1: First file path
562
+ blocks1: Code blocks from first file
563
+ file2: Second file path
564
+ blocks2: Code blocks from second file
565
+
566
+ Returns:
567
+ List of detected clones
568
+ """
569
+ clones: list[CloneReport] = []
570
+
571
+ for start1, end1, code1, norm1 in blocks1:
572
+ for start2, end2, code2, norm2 in blocks2:
573
+ # Calculate similarity using both raw text and normalized AST
574
+ text_similarity = self._calculate_similarity(code1, code2)
575
+ ast_similarity = self._calculate_similarity(norm1, norm2)
576
+
577
+ # Use max of both similarities to catch Type-2 clones
578
+ similarity = max(text_similarity, ast_similarity)
579
+
580
+ if similarity >= self.min_similarity:
581
+ clone_type = self._classify_clone_type(similarity)
582
+
583
+ clone = CloneReport(
584
+ file1=file1,
585
+ file2=file2,
586
+ line_start1=start1,
587
+ line_end1=end1,
588
+ line_start2=start2,
589
+ line_end2=end2,
590
+ similarity=similarity,
591
+ clone_type=clone_type,
592
+ code_snippet1=code1,
593
+ code_snippet2=code2,
594
+ )
595
+ clones.append(clone)
596
+
597
+ return clones
598
+
599
+ def _read_lines(self, file_path: Path, start_line: int, end_line: int) -> str:
600
+ """Read specific lines from a file.
601
+
602
+ Args:
603
+ file_path: Path to file
604
+ start_line: Starting line number (1-indexed)
605
+ end_line: Ending line number (1-indexed)
606
+
607
+ Returns:
608
+ String containing the specified lines
609
+ """
610
+ try:
611
+ with open(file_path, encoding="utf-8") as f:
612
+ lines = f.readlines()
613
+ # Convert to 0-indexed
614
+ return "".join(lines[start_line - 1 : end_line])
615
+ except Exception as e:
616
+ logger.warning("Error reading lines from %s: %s", file_path, e)
617
+ return ""
618
+
619
+ def _calculate_similarity(self, text1: str, text2: str) -> float:
620
+ """Calculate similarity between two text strings.
621
+
622
+ Uses difflib's SequenceMatcher for similarity calculation.
623
+
624
+ Args:
625
+ text1: First text string
626
+ text2: Second text string
627
+
628
+ Returns:
629
+ Similarity score from 0.0 to 1.0
630
+ """
631
+ return difflib.SequenceMatcher(None, text1, text2).ratio()
632
+
633
+ def _classify_clone_type(self, similarity: float) -> str:
634
+ """Classify clone type based on similarity score.
635
+
636
+ Args:
637
+ similarity: Similarity score from 0.0 to 1.0
638
+
639
+ Returns:
640
+ Clone type: "exact", "renamed", or "modified"
641
+ """
642
+ if similarity >= self.EXACT_THRESHOLD:
643
+ return "exact"
644
+ if similarity >= self.RENAMED_THRESHOLD:
645
+ return "renamed"
646
+ return "modified"
647
+
648
+ def find_similar_functions(self, file1: Path, file2: Path) -> SimilarityReport:
649
+ """Find similar functions between two files.
650
+
651
+ Uses AST analysis to compare function structures and identify similar
652
+ implementations. Supports multi-language comparison using tree-sitter.
653
+
654
+ Args:
655
+ file1: First source file path
656
+ file2: Second source file path
657
+
658
+ Returns:
659
+ SimilarityReport with function-level similarity analysis
660
+
661
+ Raises:
662
+ ValueError: If files don't exist or have incompatible languages
663
+ """
664
+ if not file1.exists() or not file2.exists():
665
+ raise ValueError("Both files must exist")
666
+
667
+ # Detect languages
668
+ lang1 = self._detect_language(file1)
669
+ lang2 = self._detect_language(file2)
670
+
671
+ if lang1 is None or lang2 is None:
672
+ raise ValueError(f"Unsupported file types: {file1.suffix}, {file2.suffix}")
673
+
674
+ if lang1 != lang2:
675
+ raise ValueError(f"Cannot compare different languages: {lang1} vs {lang2}")
676
+
677
+ logger.info(
678
+ "Analyzing function similarity between %s and %s (%s)", file1, file2, lang1
679
+ )
680
+
681
+ # Use language-specific analysis
682
+ if lang1 == "python":
683
+ return self._find_similar_functions_python(file1, file2)
684
+ return self._find_similar_functions_tree_sitter(file1, file2, lang1)
685
+
686
def _find_similar_functions_python(
    self, file1: Path, file2: Path
) -> SimilarityReport:
    """Compare every function of one Python file with every function of another.

    Args:
        file1: First Python file
        file2: Second Python file

    Returns:
        SimilarityReport listing the function pairs whose score reaches
        ``self.min_similarity``; its overall similarity is the mean of
        those scores (0.0 when no pair qualifies). An empty report is
        returned if either file cannot be parsed.
    """
    # Parse AST for both files
    try:
        tree1, tree2 = self._parse_file(file1), self._parse_file(file2)
    except Exception as e:
        logger.error("Error parsing files: %s", e)
        return SimilarityReport(file1=file1, file2=file2)

    # Extract functions from both files
    funcs1 = self._extract_functions(tree1)
    funcs2 = self._extract_functions(tree2)

    for count, origin in ((len(funcs1), file1), (len(funcs2), file2)):
        logger.debug("Found %d functions in %s", count, origin)

    # Compare all function pairs, keeping those at or above the threshold.
    matches: list[tuple[str, str, float]] = [
        (left_name, right_name, score)
        for left_name, left_fn in funcs1.items()
        for right_name, right_fn in funcs2.items()
        if (score := self._compare_functions(left_fn, right_fn))
        >= self.min_similarity
    ]

    # Overall file similarity: mean score of the retained pairs.
    if matches:
        mean_score = sum(value for _, _, value in matches) / len(matches)
    else:
        mean_score = 0.0

    return SimilarityReport(
        file1=file1,
        file2=file2,
        similar_functions=matches,
        overall_similarity=mean_score,
    )
734
+
735
def _find_similar_functions_tree_sitter(
    self, file1: Path, file2: Path, language: str
) -> SimilarityReport:
    """Find similar code blocks in two non-Python files using tree-sitter.

    Blocks come from ``_extract_code_blocks`` and are unpacked here as
    ``(start, end, code, normalized)`` tuples. Each pair of blocks is scored
    on both the code and the normalized form, and the higher score is kept.

    Args:
        file1: First source file
        file2: Second source file
        language: Programming language; must be a key of ``self._parsers``

    Returns:
        SimilarityReport whose entries are named ``block_<start>-<end>``.
        The report is empty when no parser is registered for ``language``.
    """
    if language not in self._parsers:
        logger.warning("No parser for %s", language)
        return SimilarityReport(file1=file1, file2=file2)

    parser = self._parsers[language]

    # Extract blocks from both files
    blocks1 = self._extract_code_blocks(file1, parser, language)
    blocks2 = self._extract_code_blocks(file2, parser, language)

    logger.debug("Found %d blocks in %s", len(blocks1), file1)
    logger.debug("Found %d blocks in %s", len(blocks2), file2)

    # Compare all block pairs
    similar_functions: list[tuple[str, str, float]] = []
    for start1, end1, code1, norm1 in blocks1:
        # Blocks carry no names, so their line range serves as identifier.
        name1 = f"block_{start1}-{end1}"
        for start2, end2, code2, norm2 in blocks2:
            text_sim = self._calculate_similarity(code1, code2)
            ast_sim = self._calculate_similarity(norm1, norm2)
            similarity = max(text_sim, ast_sim)

            if similarity >= self.min_similarity:
                name2 = f"block_{start2}-{end2}"
                similar_functions.append((name1, name2, similarity))

    # Overall similarity is the mean over the qualifying pairs only.
    overall_similarity = 0.0
    if similar_functions:
        overall_similarity = sum(s for _, _, s in similar_functions) / len(
            similar_functions
        )

    return SimilarityReport(
        file1=file1,
        file2=file2,
        similar_functions=similar_functions,
        overall_similarity=overall_similarity,
    )
790
+
791
+ def _parse_file(self, file_path: Path) -> ast.AST:
792
+ """Parse Python file into AST.
793
+
794
+ Args:
795
+ file_path: Path to Python file
796
+
797
+ Returns:
798
+ AST node
799
+
800
+ Raises:
801
+ SyntaxError: If file has syntax errors
802
+ """
803
+ with open(file_path, encoding="utf-8") as f:
804
+ return ast.parse(f.read(), filename=str(file_path))
805
+
806
+ def _extract_functions(self, tree: ast.AST) -> dict[str, ast.FunctionDef]:
807
+ """Extract function definitions from AST.
808
+
809
+ Args:
810
+ tree: AST root node
811
+
812
+ Returns:
813
+ Dictionary mapping function names to FunctionDef nodes
814
+ """
815
+ functions: dict[str, ast.FunctionDef] = {}
816
+ for node in ast.walk(tree):
817
+ if isinstance(node, ast.FunctionDef):
818
+ functions[node.name] = node
819
+ return functions
820
+
821
+ def _compare_functions(
822
+ self, func1: ast.FunctionDef, func2: ast.FunctionDef
823
+ ) -> float:
824
+ """Compare two function AST nodes for similarity.
825
+
826
+ Args:
827
+ func1: First function AST node
828
+ func2: Second function AST node
829
+
830
+ Returns:
831
+ Similarity score from 0.0 to 1.0
832
+ """
833
+ # Convert AST to source code
834
+ code1 = ast.unparse(func1)
835
+ code2 = ast.unparse(func2)
836
+
837
+ # Use difflib for comparison
838
+ return self._calculate_similarity(code1, code2)
839
+
840
def suggest_parameterization(
    self, clones: list[CloneReport]
) -> list[RefactoringSuggestion]:
    """Turn detected clones into refactoring suggestions.

    Clones are grouped with ``_group_similar_clones`` and each group is
    passed to ``_create_suggestion``. A group whose suggestion cannot be
    built is logged and skipped, so one failure never aborts the whole run.

    Args:
        clones: List of detected clone reports

    Returns:
        List of RefactoringSuggestion objects, one per group that produced a
        suggestion
    """
    logger.info("Generating refactoring suggestions for %d clones", len(clones))

    suggestions: list[RefactoringSuggestion] = []
    for group in self._group_similar_clones(clones):
        try:
            suggestion = self._create_suggestion(group)
            if suggestion:
                suggestions.append(suggestion)
        except Exception as e:
            # Best effort: keep going with the remaining groups.
            logger.warning("Error creating suggestion: %s", e)

    logger.info("Generated %d refactoring suggestions", len(suggestions))
    return suggestions
870
+
871
+ def _group_similar_clones(
872
+ self, clones: list[CloneReport]
873
+ ) -> list[list[CloneReport]]:
874
+ """Group clones that are similar to each other.
875
+
876
+ Args:
877
+ clones: List of clone reports
878
+
879
+ Returns:
880
+ List of clone groups
881
+ """
882
+ # Simple grouping: clones with same code are grouped together
883
+ groups: dict[str, list[CloneReport]] = {}
884
+ for clone in clones:
885
+ # Use code snippet as key (normalize whitespace)
886
+ key = " ".join(clone.code_snippet1.split())
887
+ if key not in groups:
888
+ groups[key] = []
889
+ groups[key].append(clone)
890
+
891
+ return list(groups.values())
892
+
893
+ def _create_suggestion(
894
+ self, clone_group: list[CloneReport]
895
+ ) -> RefactoringSuggestion | None:
896
+ """Create refactoring suggestion for a group of clones.
897
+
898
+ Args:
899
+ clone_group: List of similar clones
900
+
901
+ Returns:
902
+ RefactoringSuggestion or None if no suggestion possible
903
+ """
904
+ if not clone_group:
905
+ return None
906
+
907
+ # Use first clone as representative
908
+ representative = clone_group[0]
909
+
910
+ # Collect all affected files
911
+ affected_files: set[Path] = set()
912
+ for clone in clone_group:
913
+ affected_files.add(clone.file1)
914
+ affected_files.add(clone.file2)
915
+
916
+ # Calculate estimated reduction
917
+ # Each clone instance can be replaced with 1-2 lines (function call)
918
+ lines_per_clone = representative.line_end1 - representative.line_start1 + 1
919
+ estimated_reduction = (len(clone_group) * lines_per_clone) - lines_per_clone
920
+
921
+ # Generate function name suggestion
922
+ suggested_name = self._suggest_function_name(representative)
923
+
924
+ # Analyze code to identify potential parameters
925
+ parameters = self._identify_parameters(representative)
926
+
927
+ # Create code template
928
+ code_template = self._create_code_template(
929
+ suggested_name, parameters, representative.code_snippet1
930
+ )
931
+
932
+ description = (
933
+ f"Extract {len(clone_group)} similar code blocks "
934
+ f"from {len(affected_files)} files into reusable function"
935
+ )
936
+
937
+ return RefactoringSuggestion(
938
+ description=description,
939
+ affected_files=list(affected_files),
940
+ estimated_reduction=estimated_reduction,
941
+ suggested_function_name=suggested_name,
942
+ parameters=parameters,
943
+ code_template=code_template,
944
+ )
945
+
946
+ def _suggest_function_name(self, clone: CloneReport) -> str:
947
+ """Suggest a function name based on clone code.
948
+
949
+ Args:
950
+ clone: Clone report
951
+
952
+ Returns:
953
+ Suggested function name
954
+ """
955
+ # Simple heuristic: extract first meaningful identifier
956
+ try:
957
+ tree = ast.parse(clone.code_snippet1)
958
+ for node in ast.walk(tree):
959
+ if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
960
+ return f"extracted_{node.func.id}"
961
+ except Exception:
962
+ pass
963
+
964
+ return "extracted_function"
965
+
966
+ def _identify_parameters(self, clone: CloneReport) -> list[str]:
967
+ """Identify potential parameters for extracted function.
968
+
969
+ Args:
970
+ clone: Clone report
971
+
972
+ Returns:
973
+ List of parameter names
974
+ """
975
+ parameters: list[str] = []
976
+
977
+ try:
978
+ tree = ast.parse(clone.code_snippet1)
979
+ # Collect names that are used but not defined in the snippet
980
+ names_used: set[str] = set()
981
+ names_defined: set[str] = set()
982
+
983
+ for node in ast.walk(tree):
984
+ if isinstance(node, ast.Name):
985
+ if isinstance(node.ctx, ast.Store):
986
+ names_defined.add(node.id)
987
+ elif isinstance(node.ctx, ast.Load):
988
+ names_used.add(node.id)
989
+
990
+ # Parameters are names used but not defined
991
+ parameters = list(names_used - names_defined)
992
+
993
+ except Exception as e:
994
+ logger.debug("Error identifying parameters: %s", e)
995
+
996
+ return parameters or ["data"] # Default parameter if none found
997
+
998
+ def _create_code_template(
999
+ self, func_name: str, parameters: list[str], code: str
1000
+ ) -> str:
1001
+ """Create code template for suggested refactoring.
1002
+
1003
+ Args:
1004
+ func_name: Suggested function name
1005
+ parameters: List of parameter names
1006
+ code: Clone code snippet
1007
+
1008
+ Returns:
1009
+ Code template string
1010
+ """
1011
+ param_str = ", ".join(parameters)
1012
+ indent = " "
1013
+
1014
+ # Indent code block
1015
+ indented_code = "\n".join(
1016
+ f"{indent}{line}" if line.strip() else "" for line in code.split("\n")
1017
+ )
1018
+
1019
+ return (
1020
+ f"""def {func_name}({param_str}):
1021
+ \"\"\"Extracted common logic from multiple locations.
1022
+
1023
+ Args:
1024
+ {indent}{indent}"""
1025
+ + f"\n{indent}{indent}".join(f"{p}: Parameter" for p in parameters)
1026
+ + f"""
1027
+ \"\"\"
1028
+ {indented_code}
1029
+ """
1030
+ )