treepeat 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. treepeat-0.0.1/PKG-INFO +89 -0
  2. treepeat-0.0.1/README.md +68 -0
  3. treepeat-0.0.1/pyproject.toml +56 -0
  4. treepeat-0.0.1/setup.cfg +4 -0
  5. treepeat-0.0.1/treepeat/__init__.py +0 -0
  6. treepeat-0.0.1/treepeat/cli/__init__.py +5 -0
  7. treepeat-0.0.1/treepeat/cli/cli.py +60 -0
  8. treepeat-0.0.1/treepeat/cli/commands/__init__.py +7 -0
  9. treepeat-0.0.1/treepeat/cli/commands/detect.py +390 -0
  10. treepeat-0.0.1/treepeat/cli/commands/list_ruleset.py +82 -0
  11. treepeat-0.0.1/treepeat/cli/commands/treesitter.py +316 -0
  12. treepeat-0.0.1/treepeat/config.py +103 -0
  13. treepeat-0.0.1/treepeat/diff.py +208 -0
  14. treepeat-0.0.1/treepeat/formatters/__init__.py +6 -0
  15. treepeat-0.0.1/treepeat/formatters/json.py +52 -0
  16. treepeat-0.0.1/treepeat/formatters/sarif.py +172 -0
  17. treepeat-0.0.1/treepeat/models/__init__.py +5 -0
  18. treepeat-0.0.1/treepeat/models/ast.py +50 -0
  19. treepeat-0.0.1/treepeat/models/normalization.py +41 -0
  20. treepeat-0.0.1/treepeat/models/shingle.py +116 -0
  21. treepeat-0.0.1/treepeat/models/similarity.py +122 -0
  22. treepeat-0.0.1/treepeat/pipeline/__init__.py +5 -0
  23. treepeat-0.0.1/treepeat/pipeline/auto_chunk_extraction.py +243 -0
  24. treepeat-0.0.1/treepeat/pipeline/languages/__init__.py +54 -0
  25. treepeat-0.0.1/treepeat/pipeline/languages/base.py +36 -0
  26. treepeat-0.0.1/treepeat/pipeline/languages/bash.py +58 -0
  27. treepeat-0.0.1/treepeat/pipeline/languages/css.py +71 -0
  28. treepeat-0.0.1/treepeat/pipeline/languages/html.py +60 -0
  29. treepeat-0.0.1/treepeat/pipeline/languages/javascript.py +78 -0
  30. treepeat-0.0.1/treepeat/pipeline/languages/markdown.py +30 -0
  31. treepeat-0.0.1/treepeat/pipeline/languages/python.py +132 -0
  32. treepeat-0.0.1/treepeat/pipeline/languages/sql.py +53 -0
  33. treepeat-0.0.1/treepeat/pipeline/languages/typescript.py +10 -0
  34. treepeat-0.0.1/treepeat/pipeline/lsh_stage.py +361 -0
  35. treepeat-0.0.1/treepeat/pipeline/minhash_stage.py +63 -0
  36. treepeat-0.0.1/treepeat/pipeline/parse.py +310 -0
  37. treepeat-0.0.1/treepeat/pipeline/pipeline.py +243 -0
  38. treepeat-0.0.1/treepeat/pipeline/region_extraction.py +248 -0
  39. treepeat-0.0.1/treepeat/pipeline/rules/__init__.py +0 -0
  40. treepeat-0.0.1/treepeat/pipeline/rules/engine.py +314 -0
  41. treepeat-0.0.1/treepeat/pipeline/rules/models.py +46 -0
  42. treepeat-0.0.1/treepeat/pipeline/rules/parser.py +137 -0
  43. treepeat-0.0.1/treepeat/pipeline/rules_factory.py +54 -0
  44. treepeat-0.0.1/treepeat/pipeline/shingle.py +289 -0
  45. treepeat-0.0.1/treepeat/pipeline/statistical_chunk_extraction.py +441 -0
  46. treepeat-0.0.1/treepeat/pipeline/verification.py +218 -0
  47. treepeat-0.0.1/treepeat/terminal_detect.py +274 -0
  48. treepeat-0.0.1/treepeat.egg-info/PKG-INFO +89 -0
  49. treepeat-0.0.1/treepeat.egg-info/SOURCES.txt +51 -0
  50. treepeat-0.0.1/treepeat.egg-info/dependency_links.txt +1 -0
  51. treepeat-0.0.1/treepeat.egg-info/entry_points.txt +2 -0
  52. treepeat-0.0.1/treepeat.egg-info/requires.txt +10 -0
  53. treepeat-0.0.1/treepeat.egg-info/top_level.txt +1 -0
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.4
2
+ Name: treepeat
3
+ Version: 0.0.1
4
+ Summary: treepeat: a treesitter-based CLI tool to detect similar code
5
+ Author-email: Dane Summers <dsummersl@gmail.com>
6
+ License-Expression: Apache-2.0
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Typing :: Typed
9
+ Requires-Python: >=3.11
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: click>=8.3.0
12
+ Requires-Dist: datasketch>=1.6.5
13
+ Requires-Dist: pydantic>=2.12.3
14
+ Requires-Dist: pydantic-settings>=2.0.0
15
+ Requires-Dist: python-magic>=0.4.27
16
+ Requires-Dist: rich>=14.2.0
17
+ Requires-Dist: sarif-pydantic>=0.6.2
18
+ Requires-Dist: tdqm>=0.0.1
19
+ Requires-Dist: tree-sitter>=0.25.2
20
+ Requires-Dist: tree-sitter-language-pack>=0.10.0
21
+
22
+ # Treepeat - finding groups of similar code
23
+
24
+ `treepeat` is a CLI tool that detects similar code using tree-sitter AST analysis with locality-sensitive hashing. It finds:
25
+ - **Duplicate** code blocks meaningful to the language (classes/functions), not just lines.
26
+ - Near-duplicates.
27
+ - Structurally similar text.
28
+
29
+ Helpers: This is very much a proof of concept - I'm happy with it, but I haven't supported very many languages at present. PRs welcome!
30
+
31
+ ## Usage
32
+
33
+ ### detect
34
+
35
+ Scan a codebase for similar or duplicate code blocks using tree-sitter AST analysis and locality-sensitive hashing.
36
+
37
+ Key flags:
38
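+ - `--ruleset`: Normalization ruleset to use (`none`, `default`, `loose`) - controls how code is normalized before comparison. This is a global option, so pass it before the subcommand (e.g. `treepeat --ruleset loose detect ...`)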
39
+ - `--similarity`: Percent similarity from 5-100 (default: 100 for exact duplicates)
40
+ - `--min-lines`: Minimum number of lines for a match (default: 5)
41
+ - `--diff`: Show side-by-side comparisons of similar blocks
42
+ - `--format`: Output format - `console` (default) or `sarif` for CI integration
43
+
44
+ ```bash
45
+ # Find exact duplicates
46
+ treepeat detect /path/to/codebase
47
+
48
+ # Find near-duplicates with 80% similarity threshold
49
+ treepeat detect --similarity 80 /path/to/codebase
50
+
51
+ # Show diffs between similar blocks and use loose ruleset
52
+ treepeat --ruleset loose detect --diff --min-lines 10 /path/to/codebase
53
+
54
+ # Output results in SARIF format for CI tools
55
+ treepeat detect --format sarif -o results.sarif /path/to/codebase
56
+ ```
57
+
58
+ ### Other subcommands
59
+
60
+ #### list-ruleset
61
+
62
+ List all rules in a ruleset, along with their descriptions. Use `--language` to see which rules apply to a specific language.
63
+
64
+ #### treesitter
65
+
66
+ Display how treepeat normalizes source code into tree-sitter tokens for similarity detection -- helpful for debugging why a certain section of a file might be similar to another. Shows the original source code side-by-side with the normalized token representation.
67
+
68
+ ## Dev setup
69
+
70
+ ```bash
71
+ make setup
72
+ make test
73
+ ```
74
+
75
+ ## Benchmarks
76
+
77
+ treepeat includes a testing framework for comparing duplication detection tools against real-world codebases.
78
+
79
+ ```bash
80
+ # Run all benchmark tests
81
+ make benchmark
82
+
83
+ # Compare results across tools
84
+ make benchmark-compare
85
+ ```
86
+
87
+ ## ADRs
88
+
89
+ Architecture Decision Records live in docs/adr.
@@ -0,0 +1,68 @@
1
+ # Treepeat - finding groups of similar code
2
+
3
+ `treepeat` is a CLI tool that detects similar code using tree-sitter AST analysis with locality-sensitive hashing. It finds:
4
+ - **Duplicate** code blocks meaningful to the language (classes/functions), not just lines.
5
+ - Near-duplicates.
6
+ - Structurally similar text.
7
+
8
+ Helpers: This is very much a proof of concept - I'm happy with it, but I haven't supported very many languages at present. PRs welcome!
9
+
10
+ ## Usage
11
+
12
+ ### detect
13
+
14
+ Scan a codebase for similar or duplicate code blocks using tree-sitter AST analysis and locality-sensitive hashing.
15
+
16
+ Key flags:
17
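+ - `--ruleset`: Normalization ruleset to use (`none`, `default`, `loose`) - controls how code is normalized before comparison. This is a global option, so pass it before the subcommand (e.g. `treepeat --ruleset loose detect ...`)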
18
+ - `--similarity`: Percent similarity from 5-100 (default: 100 for exact duplicates)
19
+ - `--min-lines`: Minimum number of lines for a match (default: 5)
20
+ - `--diff`: Show side-by-side comparisons of similar blocks
21
+ - `--format`: Output format - `console` (default) or `sarif` for CI integration
22
+
23
+ ```bash
24
+ # Find exact duplicates
25
+ treepeat detect /path/to/codebase
26
+
27
+ # Find near-duplicates with 80% similarity threshold
28
+ treepeat detect --similarity 80 /path/to/codebase
29
+
30
+ # Show diffs between similar blocks and use loose ruleset
31
+ treepeat --ruleset loose detect --diff --min-lines 10 /path/to/codebase
32
+
33
+ # Output results in SARIF format for CI tools
34
+ treepeat detect --format sarif -o results.sarif /path/to/codebase
35
+ ```
36
+
37
+ ### Other subcommands
38
+
39
+ #### list-ruleset
40
+
41
+ List all rules in a ruleset, along with their descriptions. Use `--language` to see which rules apply to a specific language.
42
+
43
+ #### treesitter
44
+
45
+ Display how treepeat normalizes source code into tree-sitter tokens for similarity detection -- helpful for debugging why a certain section of a file might be similar to another. Shows the original source code side-by-side with the normalized token representation.
46
+
47
+ ## Dev setup
48
+
49
+ ```bash
50
+ make setup
51
+ make test
52
+ ```
53
+
54
+ ## Benchmarks
55
+
56
+ treepeat includes a testing framework for comparing duplication detection tools against real-world codebases.
57
+
58
+ ```bash
59
+ # Run all benchmark tests
60
+ make benchmark
61
+
62
+ # Compare results across tools
63
+ make benchmark-compare
64
+ ```
65
+
66
+ ## ADRs
67
+
68
+ Architecture Decision Records live in docs/adr.
@@ -0,0 +1,56 @@
1
+ [project]
2
+ name = "treepeat"
3
+ version = "0.0.1"
4
+ description = "treepeat: a treesitter-based CLI tool to detect similar code"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ authors = [{ name = "Dane Summers", email = "dsummersl@gmail.com" }]
8
+ license = "Apache-2.0"
9
+ classifiers = [
10
+ "Programming Language :: Python :: 3",
11
+ "Typing :: Typed",
12
+ ]
13
+ dependencies = [
14
+ "click>=8.3.0",
15
+ "datasketch>=1.6.5",
16
+ "pydantic>=2.12.3",
17
+ "pydantic-settings>=2.0.0",
18
+ "python-magic>=0.4.27",
19
+ "rich>=14.2.0",
20
+ "sarif-pydantic>=0.6.2",
21
+ "tdqm>=0.0.1",
22
+ "tree-sitter>=0.25.2",
23
+ "tree-sitter-language-pack>=0.10.0",
24
+ ]
25
+
26
+ [project.scripts]
27
+ treepeat = "treepeat.cli:main"
28
+
29
+ [dependency-groups]
30
+ dev = [
31
+ "pytest>=8",
32
+ "pytest-cov>=6",
33
+ "ruff>=0.5",
34
+ "mypy>=1.10",
35
+ "pre-commit>=3.7",
36
+ "radon>=6.0.1",
37
+ ]
38
+
39
+ [tool.uv]
40
+ package = true
41
+
42
+ [tool.ruff]
43
+ line-length = 100
44
+ exclude = [".venv", ".git", "build", "dist", "tests/fixtures", "benchmark-tests/codebases"]
45
+
46
+ [tool.pyright]
47
+ exclude = ["benchmark-tests"]
48
+
49
+ [tool.mypy]
50
+ python_version = "3.11"
51
+ strict = true
52
+ packages = ["treepeat"]
53
+
54
+ [tool.pytest.ini_options]
55
+ addopts = "--cov=treepeat --cov-report=term-missing"
56
+ norecursedirs = "benchmark-tests"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,5 @@
1
+ """CLI module."""
2
+
3
+ from treepeat.cli.cli import main
4
+
5
+ __all__ = ["main"]
@@ -0,0 +1,60 @@
1
+ """CLI interface for treepeat."""
2
+
3
+ import logging
4
+
5
+ import click
6
+ from rich.console import Console
7
+ from rich.logging import RichHandler
8
+
9
+ from treepeat.cli.commands import detect, list_ruleset, treesitter
10
+
11
+ console = Console()
12
+
13
+
14
+ def setup_logging(log_level: str) -> None:
15
+ """Configure logging with rich handler."""
16
+ logging.basicConfig(
17
+ level=log_level,
18
+ format="%(message)s",
19
+ handlers=[RichHandler(console=console, rich_tracebacks=True)],
20
+ )
21
+
22
+
23
+ @click.group()
24
+ @click.pass_context
25
+ @click.option(
26
+ "--log-level",
27
+ "-l",
28
+ type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False),
29
+ default="WARNING",
30
+ help="Set the logging level",
31
+ )
32
+ @click.option(
33
+ "--ruleset",
34
+ "-r",
35
+ type=click.Choice(["none", "default", "loose"], case_sensitive=False),
36
+ default="default",
37
+ help="Built-in ruleset profile to use (default: default)",
38
+ )
39
+ def main(
40
+ ctx: click.Context,
41
+ log_level: str,
42
+ ruleset: str,
43
+ ) -> None:
44
+ """Tree-sitter based similarity detector."""
45
+ setup_logging(log_level.upper())
46
+
47
+ # Store common options in context for subcommands
48
+ ctx.ensure_object(dict)
49
+ ctx.obj["log_level"] = log_level
50
+ ctx.obj["ruleset"] = ruleset
51
+
52
+
53
+ # Register subcommands
54
+ main.add_command(detect)
55
+ main.add_command(treesitter)
56
+ main.add_command(list_ruleset)
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
@@ -0,0 +1,7 @@
1
+ """CLI subcommands."""
2
+
3
+ from .detect import detect
4
+ from .list_ruleset import list_ruleset
5
+ from .treesitter import treesitter
6
+
7
+ __all__ = ["detect", "list_ruleset", "treesitter"]
@@ -0,0 +1,390 @@
1
+ """Detect command - find similar code regions."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import click
7
+ from rich.console import Console
8
+ from rich.table import Table
9
+
10
+ from treepeat.config import LSHSettings, MinHashSettings, PipelineSettings, RulesSettings, ShingleSettings, set_settings
11
+ from treepeat.formatters import format_as_sarif
12
+ from treepeat.models.similarity import Region, RegionSignature, SimilarRegionGroup, SimilarityResult
13
+ from treepeat.pipeline.pipeline import run_pipeline
14
+
15
+ console = Console()
16
+
17
+
18
+ def _parse_patterns(pattern_string: str) -> list[str]:
19
+ """Parse comma-separated pattern string into list."""
20
+ return [p.strip() for p in pattern_string.split(",") if p.strip()]
21
+
22
+
23
+ def _create_rules_settings(ruleset: str) -> RulesSettings:
24
+ """Create RulesSettings."""
25
+ return RulesSettings(ruleset=ruleset)
26
+
27
+
28
+ def _configure_settings(
29
+ ruleset: str,
30
+ similarity_percent: float,
31
+ min_lines: int,
32
+ ignore: str,
33
+ ignore_files: str,
34
+ ignore_node_types: str,
35
+ ) -> None:
36
+ """Configure pipeline settings."""
37
+ lsh_settings = LSHSettings(
38
+ similarity_percent=similarity_percent / 100.0,
39
+ min_lines=min_lines,
40
+ ignore_node_types=_parse_patterns(ignore_node_types),
41
+ )
42
+
43
+ settings = PipelineSettings(
44
+ rules=_create_rules_settings(ruleset),
45
+ shingle=ShingleSettings(), # Uses default k=3
46
+ minhash=MinHashSettings(), # Uses default num_perm=128
47
+ lsh=lsh_settings,
48
+ ignore_patterns=_parse_patterns(ignore),
49
+ ignore_file_patterns=_parse_patterns(ignore_files),
50
+ )
51
+ set_settings(settings)
52
+
53
+
54
+ def _write_output(text: str, output_path: Path | None) -> None:
55
+ """Write output text to file or stdout."""
56
+ if output_path:
57
+ output_path.write_text(text)
58
+ else:
59
+ print(text)
60
+
61
+
62
+ def _run_pipeline_with_ui(path: Path, output_format: str) -> SimilarityResult:
63
+ """Run the pipeline with appropriate UI feedback based on output format."""
64
+ if output_format.lower() == "console":
65
+ from treepeat.config import get_settings
66
+ settings = get_settings()
67
+ console.print(f"\nRuleset: [cyan]{settings.rules.ruleset}[/cyan]")
68
+ console.print(f"Analyzing: [cyan]{path}[/cyan]\n")
69
+ with console.status("[bold green]Running pipeline..."):
70
+ return run_pipeline(path)
71
+ else:
72
+ return run_pipeline(path)
73
+
74
+
75
+ def _group_signatures_by_file(
76
+ signatures: list[RegionSignature],
77
+ ) -> dict[Path, list[RegionSignature]]:
78
+ """Group region signatures by file path."""
79
+ regions_by_file: dict[Path, list[RegionSignature]] = {}
80
+ for sig in signatures:
81
+ path = sig.region.path
82
+ if path not in regions_by_file:
83
+ regions_by_file[path] = []
84
+ regions_by_file[path].append(sig)
85
+ return regions_by_file
86
+
87
+
88
+ def _get_group_sort_key(group: SimilarRegionGroup) -> tuple[float, float]:
89
+ """Get sort key for a similarity group by similarity and average line count."""
90
+ avg_lines = sum(r.end_line - r.start_line + 1 for r in group.regions) / len(group.regions)
91
+ return (group.similarity, avg_lines)
92
+
93
+
94
+ def _format_region_name(region: Region) -> str:
95
+ """Format region name with type if not lines."""
96
+ if region.region_type == "lines":
97
+ return region.region_name
98
+ return f"{region.region_name}({region.region_type})"
99
+
100
+
101
+ def _display_group(group: SimilarRegionGroup, show_diff: bool = False) -> None:
102
+ """Display a single similarity group with optional diff."""
103
+ from treepeat.diff import display_diff
104
+
105
+ # Display similarity group header
106
+ console.print(f"Similar group found ([bold]{group.similarity:.1%}[/bold] similar, {group.size} regions):")
107
+
108
+ # Display all regions in the group
109
+ for i, region in enumerate(group.regions):
110
+ lines = region.end_line - region.start_line + 1
111
+ prefix = " - " if i == 0 else " "
112
+ region_display = _format_region_name(region)
113
+ console.print(
114
+ f"{prefix}{region.path} [{region.start_line}:{region.end_line}] "
115
+ f"({lines} lines) {region_display}"
116
+ )
117
+
118
+ # Show diff if requested and we have at least 2 regions
119
+ if show_diff and len(group.regions) >= 2:
120
+ console.print()
121
+ display_diff(group.regions[0], group.regions[1])
122
+ else:
123
+ console.print() # Blank line between groups
124
+
125
+
126
+ def display_similar_groups(result: SimilarityResult, show_diff: bool = False) -> None:
127
+ """Display similar region groups with optional diff."""
128
+ if not result.similar_groups:
129
+ console.print("\n[yellow]No similar regions found above threshold.[/yellow]")
130
+ return
131
+
132
+ console.print("\n[bold cyan]Similar Regions:[/bold cyan]")
133
+ sorted_groups = sorted(result.similar_groups, key=_get_group_sort_key)
134
+
135
+ for group in sorted_groups:
136
+ _display_group(group, show_diff=show_diff)
137
+
138
+
139
+ def display_failed_files(result: SimilarityResult, show_details: bool) -> None:
140
+ """Display failed files with optional error details."""
141
+ if not result.failed_files:
142
+ return
143
+
144
+ console.print("\n[bold red]Failed Files:[/bold red]")
145
+ for file_path, error in result.failed_files.items():
146
+ console.print(f" [red]✗[/red] {file_path}")
147
+ if show_details:
148
+ console.print(f" [dim]{error}[/dim]")
149
+
150
+
151
+ def _init_language_stats(
152
+ stats_by_format: dict[str, dict[str, int | set[Path]]], language: str
153
+ ) -> None:
154
+ """Initialize stats entry for a language if not present."""
155
+ if language not in stats_by_format:
156
+ stats_by_format[language] = {"files": set(), "groups": 0, "lines": 0}
157
+
158
+
159
+ def _collect_files_from_signatures(
160
+ signatures: list[RegionSignature],
161
+ ) -> dict[str, dict[str, int | set[Path]]]:
162
+ """Collect all processed files from signatures."""
163
+ stats_by_format: dict[str, dict[str, int | set[Path]]] = {}
164
+
165
+ for signature in signatures:
166
+ region = signature.region
167
+ language = region.language
168
+ _init_language_stats(stats_by_format, language)
169
+ stats = stats_by_format[language]
170
+ stats["files"].add(region.path) # type: ignore[union-attr]
171
+
172
+ return stats_by_format
173
+
174
+
175
+ def _add_duplicate_stats(
176
+ stats_by_format: dict[str, dict[str, int | set[Path]]],
177
+ similar_groups: list[SimilarRegionGroup],
178
+ ) -> None:
179
+ """Add group counts and duplicate lines from similar groups."""
180
+ for group in similar_groups:
181
+ for region in group.regions:
182
+ language = region.language
183
+ _init_language_stats(stats_by_format, language)
184
+ stats = stats_by_format[language]
185
+ stats["lines"] += region.end_line - region.start_line + 1 # type: ignore[operator]
186
+
187
+ # Count group once per language (use first region's language)
188
+ if group.regions:
189
+ first_language = group.regions[0].language
190
+ stats_by_format[first_language]["groups"] += 1 # type: ignore[operator]
191
+
192
+
193
+ def _collect_format_statistics(result: SimilarityResult) -> dict[str, dict[str, int | set[Path]]]:
194
+ """Collect statistics by language/format from all processed files."""
195
+ stats_by_format = _collect_files_from_signatures(result.signatures)
196
+ _add_duplicate_stats(stats_by_format, result.similar_groups)
197
+ return stats_by_format
198
+
199
+
200
+ def _populate_summary_table(
201
+ table: Table,
202
+ stats_by_format: dict[str, dict[str, int | set[Path]]],
203
+ ) -> tuple[set[Path], int, int]:
204
+ """Populate summary table with format statistics and return totals."""
205
+ total_files: set[Path] = set()
206
+ total_groups = 0
207
+ total_lines = 0
208
+
209
+ for language in sorted(stats_by_format.keys()):
210
+ stats = stats_by_format[language]
211
+ files = stats["files"]
212
+ groups = stats["groups"]
213
+ lines = stats["lines"]
214
+
215
+ # Type narrowing assertions
216
+ assert isinstance(files, set)
217
+ assert isinstance(groups, int)
218
+ assert isinstance(lines, int)
219
+
220
+ table.add_row(
221
+ language,
222
+ str(len(files)),
223
+ str(groups),
224
+ str(lines),
225
+ )
226
+
227
+ # Accumulate totals
228
+ total_files.update(files)
229
+ total_groups += groups
230
+ total_lines += lines
231
+
232
+ return total_files, total_groups, total_lines
233
+
234
+
235
+ def display_summary_table(result: SimilarityResult) -> None:
236
+ """Display summary table with statistics by format."""
237
+ # Show stats even if no similar groups found (to show all processed files)
238
+ if not result.signatures:
239
+ return
240
+
241
+ stats_by_format = _collect_format_statistics(result)
242
+
243
+ # Create summary table
244
+ table = Table(show_header=True, header_style="bold")
245
+ table.add_column("Format", style="cyan")
246
+ table.add_column("# Files", justify="right")
247
+ table.add_column("Groups Found", justify="right")
248
+ table.add_column("Lines", justify="right")
249
+
250
+ # Populate table and calculate totals
251
+ total_files, total_groups, total_lines = _populate_summary_table(table, stats_by_format)
252
+
253
+ # Add totals row
254
+ table.add_row(
255
+ "[bold]Totals[/bold]",
256
+ f"[bold]{len(total_files)}[/bold]",
257
+ f"[bold]{total_groups}[/bold]",
258
+ f"[bold]{total_lines}[/bold]",
259
+ end_section=True,
260
+ )
261
+
262
+ console.print("\n")
263
+ console.print(table)
264
+
265
+
266
+ def _handle_output(
267
+ result: SimilarityResult,
268
+ output_format: str,
269
+ output_path: Path | None,
270
+ log_level: str,
271
+ show_diff: bool = False,
272
+ ) -> None:
273
+ """Handle formatting and outputting results."""
274
+ if output_format.lower() == "sarif":
275
+ output_text = format_as_sarif(result, pretty=True)
276
+ _write_output(output_text, output_path)
277
+ else: # console
278
+ display_similar_groups(result, show_diff=show_diff)
279
+ display_failed_files(result, show_details=(log_level.upper() == "DEBUG"))
280
+ display_summary_table(result)
281
+ console.print()
282
+
283
+
284
+ def _check_result_errors(result: SimilarityResult, output_format: str) -> None:
285
+ """Check for errors in the result and exit if necessary."""
286
+ if result.success_count == 0 and result.failure_count > 0:
287
+ if output_format.lower() == "console":
288
+ console.print("[bold red]Error:[/bold red] Failed to parse any files")
289
+ display_failed_files(result, show_details=True)
290
+ sys.exit(1)
291
+
292
+
293
+ @click.command()
294
+ @click.argument("path", type=click.Path(exists=True, path_type=Path))
295
+ @click.pass_context
296
+ @click.option(
297
+ "--similarity",
298
+ "-s",
299
+ type=click.IntRange(5, 100),
300
+ default=100,
301
+ help="Percent similarity threshold (default: 100)",
302
+ )
303
+ @click.option(
304
+ "--min-lines",
305
+ "-ml",
306
+ type=click.IntRange(1),
307
+ default=5,
308
+ help="Minimum number of lines to be considered similar (default: 5)",
309
+ )
310
+ @click.option(
311
+ "--format",
312
+ "-f",
313
+ "output_format",
314
+ type=click.Choice(["console", "sarif"], case_sensitive=False),
315
+ default="console",
316
+ help="Output format (default: console)",
317
+ )
318
+ @click.option(
319
+ "--output",
320
+ "-o",
321
+ type=click.Path(path_type=Path),
322
+ default=None,
323
+ help="Output file path (default: stdout)",
324
+ )
325
+ @click.option(
326
+ "--ignore",
327
+ "-i",
328
+ type=str,
329
+ default="",
330
+ help="Comma-separated list of glob patterns to ignore files (e.g., '*.test.py,**/node_modules/**')",
331
+ )
332
+ @click.option(
333
+ "--ignore-files",
334
+ "-if",
335
+ type=str,
336
+ default="**/.*ignore",
337
+ help="Comma-separated list of glob patterns to find ignore files (default: '**/.*ignore')",
338
+ )
339
+ @click.option(
340
+ "--diff",
341
+ "-d",
342
+ is_flag=True,
343
+ default=False,
344
+ help="Show side-by-side diff between the first two files in each similar group (console format only)",
345
+ )
346
+ @click.option(
347
+ "--fail",
348
+ is_flag=True,
349
+ default=False,
350
+ help="Exit with error code 1 if any similar blocks are detected",
351
+ )
352
+ @click.option(
353
+ "--ignore-node-types",
354
+ "-int",
355
+ type=str,
356
+ default="",
357
+ help="Comma-separated list of AST node types to ignore during region extraction (e.g., 'parameters,argument_list')",
358
+ )
359
+ def detect(
360
+ ctx: click.Context,
361
+ path: Path,
362
+ similarity: float,
363
+ min_lines: int,
364
+ output_format: str,
365
+ output: Path | None,
366
+ ignore: str,
367
+ ignore_files: str,
368
+ diff: bool,
369
+ fail: bool,
370
+ ignore_node_types: str,
371
+ ) -> None:
372
+ """Detect similar code regions of files in a path."""
373
+ log_level = ctx.obj["log_level"]
374
+ ruleset = ctx.obj["ruleset"]
375
+
376
+ _configure_settings(
377
+ ruleset,
378
+ similarity,
379
+ min_lines,
380
+ ignore,
381
+ ignore_files,
382
+ ignore_node_types,
383
+ )
384
+ result = _run_pipeline_with_ui(path, output_format)
385
+ _check_result_errors(result, output_format)
386
+ _handle_output(result, output_format, output, log_level, diff)
387
+
388
+ # Exit with error code 1 when --fail is set and any similar blocks are detected
389
+ if fail and result.similar_groups:
390
+ sys.exit(1)