treepeat 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- treepeat-0.0.1/PKG-INFO +89 -0
- treepeat-0.0.1/README.md +68 -0
- treepeat-0.0.1/pyproject.toml +56 -0
- treepeat-0.0.1/setup.cfg +4 -0
- treepeat-0.0.1/treepeat/__init__.py +0 -0
- treepeat-0.0.1/treepeat/cli/__init__.py +5 -0
- treepeat-0.0.1/treepeat/cli/cli.py +60 -0
- treepeat-0.0.1/treepeat/cli/commands/__init__.py +7 -0
- treepeat-0.0.1/treepeat/cli/commands/detect.py +390 -0
- treepeat-0.0.1/treepeat/cli/commands/list_ruleset.py +82 -0
- treepeat-0.0.1/treepeat/cli/commands/treesitter.py +316 -0
- treepeat-0.0.1/treepeat/config.py +103 -0
- treepeat-0.0.1/treepeat/diff.py +208 -0
- treepeat-0.0.1/treepeat/formatters/__init__.py +6 -0
- treepeat-0.0.1/treepeat/formatters/json.py +52 -0
- treepeat-0.0.1/treepeat/formatters/sarif.py +172 -0
- treepeat-0.0.1/treepeat/models/__init__.py +5 -0
- treepeat-0.0.1/treepeat/models/ast.py +50 -0
- treepeat-0.0.1/treepeat/models/normalization.py +41 -0
- treepeat-0.0.1/treepeat/models/shingle.py +116 -0
- treepeat-0.0.1/treepeat/models/similarity.py +122 -0
- treepeat-0.0.1/treepeat/pipeline/__init__.py +5 -0
- treepeat-0.0.1/treepeat/pipeline/auto_chunk_extraction.py +243 -0
- treepeat-0.0.1/treepeat/pipeline/languages/__init__.py +54 -0
- treepeat-0.0.1/treepeat/pipeline/languages/base.py +36 -0
- treepeat-0.0.1/treepeat/pipeline/languages/bash.py +58 -0
- treepeat-0.0.1/treepeat/pipeline/languages/css.py +71 -0
- treepeat-0.0.1/treepeat/pipeline/languages/html.py +60 -0
- treepeat-0.0.1/treepeat/pipeline/languages/javascript.py +78 -0
- treepeat-0.0.1/treepeat/pipeline/languages/markdown.py +30 -0
- treepeat-0.0.1/treepeat/pipeline/languages/python.py +132 -0
- treepeat-0.0.1/treepeat/pipeline/languages/sql.py +53 -0
- treepeat-0.0.1/treepeat/pipeline/languages/typescript.py +10 -0
- treepeat-0.0.1/treepeat/pipeline/lsh_stage.py +361 -0
- treepeat-0.0.1/treepeat/pipeline/minhash_stage.py +63 -0
- treepeat-0.0.1/treepeat/pipeline/parse.py +310 -0
- treepeat-0.0.1/treepeat/pipeline/pipeline.py +243 -0
- treepeat-0.0.1/treepeat/pipeline/region_extraction.py +248 -0
- treepeat-0.0.1/treepeat/pipeline/rules/__init__.py +0 -0
- treepeat-0.0.1/treepeat/pipeline/rules/engine.py +314 -0
- treepeat-0.0.1/treepeat/pipeline/rules/models.py +46 -0
- treepeat-0.0.1/treepeat/pipeline/rules/parser.py +137 -0
- treepeat-0.0.1/treepeat/pipeline/rules_factory.py +54 -0
- treepeat-0.0.1/treepeat/pipeline/shingle.py +289 -0
- treepeat-0.0.1/treepeat/pipeline/statistical_chunk_extraction.py +441 -0
- treepeat-0.0.1/treepeat/pipeline/verification.py +218 -0
- treepeat-0.0.1/treepeat/terminal_detect.py +274 -0
- treepeat-0.0.1/treepeat.egg-info/PKG-INFO +89 -0
- treepeat-0.0.1/treepeat.egg-info/SOURCES.txt +51 -0
- treepeat-0.0.1/treepeat.egg-info/dependency_links.txt +1 -0
- treepeat-0.0.1/treepeat.egg-info/entry_points.txt +2 -0
- treepeat-0.0.1/treepeat.egg-info/requires.txt +10 -0
- treepeat-0.0.1/treepeat.egg-info/top_level.txt +1 -0
treepeat-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: treepeat
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: treepeat: a treesitter-based CLI tool to detect similar code
|
|
5
|
+
Author-email: Dane Summers <dsummersl@gmail.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Typing :: Typed
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: click>=8.3.0
|
|
12
|
+
Requires-Dist: datasketch>=1.6.5
|
|
13
|
+
Requires-Dist: pydantic>=2.12.3
|
|
14
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
15
|
+
Requires-Dist: python-magic>=0.4.27
|
|
16
|
+
Requires-Dist: rich>=14.2.0
|
|
17
|
+
Requires-Dist: sarif-pydantic>=0.6.2
|
|
18
|
+
Requires-Dist: tdqm>=0.0.1
|
|
19
|
+
Requires-Dist: tree-sitter>=0.25.2
|
|
20
|
+
Requires-Dist: tree-sitter-language-pack>=0.10.0
|
|
21
|
+
|
|
22
|
+
# Treepeat - finding groups of similar code
|
|
23
|
+
|
|
24
|
+
`treepeat` is a CLI tool that detects similar code using treesitter AST analysis with locality-sensitive hashing:
|
|
25
|
+
- Find **duplicate** code blocks meaningful to the language (classes/functions), not just lines.
|
|
26
|
+
- near-duplicates
|
|
27
|
+
- structurally similar text.
|
|
28
|
+
|
|
29
|
+
Helpers: This is very much a proof of concept - I'm happy with it, but I haven't supported very many languages yet. PRs welcome!
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
### detect
|
|
34
|
+
|
|
35
|
+
Scan a codebase for similar or duplicate code blocks using tree-sitter AST analysis and locality-sensitive hashing.
|
|
36
|
+
|
|
37
|
+
Key flags:
|
|
38
|
+
- `--ruleset`: Normalization ruleset to use (`none`, `default`, `loose`) - controls how code is normalized before comparison
|
|
39
|
+
- `--similarity`: Percent similarity from 5-100 (default: 100 for exact duplicates)
|
|
40
|
+
- `--min-lines`: Minimum number of lines for a match (default: 5)
|
|
41
|
+
- `--diff`: Show side-by-side comparisons of similar blocks
|
|
42
|
+
- `--format`: Output format - `console` (default) or `sarif` for CI integration
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Find exact duplicates
|
|
46
|
+
treepeat detect /path/to/codebase
|
|
47
|
+
|
|
48
|
+
# Find near-duplicates with 80% similarity threshold
|
|
49
|
+
treepeat detect --similarity 80 /path/to/codebase
|
|
50
|
+
|
|
51
|
+
# Show diffs between similar blocks and use loose ruleset
|
|
52
|
+
treepeat --ruleset loose detect --diff --min-lines 10 /path/to/codebase
|
|
53
|
+
|
|
54
|
+
# Output results in SARIF format for CI tools
|
|
55
|
+
treepeat detect --format sarif -o results.sarif /path/to/codebase
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Other sub commands
|
|
59
|
+
|
|
60
|
+
#### list-ruleset
|
|
61
|
+
|
|
62
|
+
List all rules in a ruleset, along with their descriptions. Use `--language` to see which rules apply to a specific language.
|
|
63
|
+
|
|
64
|
+
#### treesitter
|
|
65
|
+
|
|
66
|
+
Display how treepeat normalizes source code into tree-sitter tokens for similarity detection -- helpful for debugging why a certain section of a file might be similar to another. Shows the original source code side-by-side with the normalized token representation.
|
|
67
|
+
|
|
68
|
+
## Dev setup
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
make setup
|
|
72
|
+
make test
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Benchmarks
|
|
76
|
+
|
|
77
|
+
treepeat includes a testing framework for comparing duplication detection tools against real-world codebases.
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Run all benchmark tests
|
|
81
|
+
make benchmark
|
|
82
|
+
|
|
83
|
+
# Compare results across tools
|
|
84
|
+
make benchmark-compare
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## ADRs
|
|
88
|
+
|
|
89
|
+
Architecture Decision Records live in docs/adr.
|
treepeat-0.0.1/README.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Treepeat - finding groups of similar code
|
|
2
|
+
|
|
3
|
+
`treepeat` is a CLI tool that detects similar code using treesitter AST analysis with locality-sensitive hashing:
|
|
4
|
+
- Find **duplicate** code blocks meaningful to the language (classes/functions), not just lines.
|
|
5
|
+
- near-duplicates
|
|
6
|
+
- structurally similar text.
|
|
7
|
+
|
|
8
|
+
Helpers: This is very much a proof of concept - I'm happy with it, but I haven't supported very many languages yet. PRs welcome!
|
|
9
|
+
|
|
10
|
+
## Usage
|
|
11
|
+
|
|
12
|
+
### detect
|
|
13
|
+
|
|
14
|
+
Scan a codebase for similar or duplicate code blocks using tree-sitter AST analysis and locality-sensitive hashing.
|
|
15
|
+
|
|
16
|
+
Key flags:
|
|
17
|
+
- `--ruleset`: Normalization ruleset to use (`none`, `default`, `loose`) - controls how code is normalized before comparison
|
|
18
|
+
- `--similarity`: Percent similarity from 5-100 (default: 100 for exact duplicates)
|
|
19
|
+
- `--min-lines`: Minimum number of lines for a match (default: 5)
|
|
20
|
+
- `--diff`: Show side-by-side comparisons of similar blocks
|
|
21
|
+
- `--format`: Output format - `console` (default) or `sarif` for CI integration
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
# Find exact duplicates
|
|
25
|
+
treepeat detect /path/to/codebase
|
|
26
|
+
|
|
27
|
+
# Find near-duplicates with 80% similarity threshold
|
|
28
|
+
treepeat detect --similarity 80 /path/to/codebase
|
|
29
|
+
|
|
30
|
+
# Show diffs between similar blocks and use loose ruleset
|
|
31
|
+
treepeat --ruleset loose detect --diff --min-lines 10 /path/to/codebase
|
|
32
|
+
|
|
33
|
+
# Output results in SARIF format for CI tools
|
|
34
|
+
treepeat detect --format sarif -o results.sarif /path/to/codebase
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Other sub commands
|
|
38
|
+
|
|
39
|
+
#### list-ruleset
|
|
40
|
+
|
|
41
|
+
List all rules in a ruleset, along with their descriptions. Use `--language` to see which rules apply to a specific language.
|
|
42
|
+
|
|
43
|
+
#### treesitter
|
|
44
|
+
|
|
45
|
+
Display how treepeat normalizes source code into tree-sitter tokens for similarity detection -- helpful for debugging why a certain section of a file might be similar to another. Shows the original source code side-by-side with the normalized token representation.
|
|
46
|
+
|
|
47
|
+
## Dev setup
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
make setup
|
|
51
|
+
make test
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Benchmarks
|
|
55
|
+
|
|
56
|
+
treepeat includes a testing framework for comparing duplication detection tools against real-world codebases.
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# Run all benchmark tests
|
|
60
|
+
make benchmark
|
|
61
|
+
|
|
62
|
+
# Compare results across tools
|
|
63
|
+
make benchmark-compare
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## ADRs
|
|
67
|
+
|
|
68
|
+
Architecture Decision Records live in docs/adr.
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "treepeat"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "treepeat: a treesitter-based CLI tool to detect similar code"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
authors = [{ name = "Dane Summers", email = "dsummersl@gmail.com" }]
|
|
8
|
+
license = "Apache-2.0"
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Programming Language :: Python :: 3",
|
|
11
|
+
"Typing :: Typed",
|
|
12
|
+
]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"click>=8.3.0",
|
|
15
|
+
"datasketch>=1.6.5",
|
|
16
|
+
"pydantic>=2.12.3",
|
|
17
|
+
"pydantic-settings>=2.0.0",
|
|
18
|
+
"python-magic>=0.4.27",
|
|
19
|
+
"rich>=14.2.0",
|
|
20
|
+
"sarif-pydantic>=0.6.2",
|
|
21
|
+
"tdqm>=0.0.1",
|
|
22
|
+
"tree-sitter>=0.25.2",
|
|
23
|
+
"tree-sitter-language-pack>=0.10.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.scripts]
|
|
27
|
+
treepeat = "treepeat.cli:main"
|
|
28
|
+
|
|
29
|
+
[dependency-groups]
|
|
30
|
+
dev = [
|
|
31
|
+
"pytest>=8",
|
|
32
|
+
"pytest-cov>=6",
|
|
33
|
+
"ruff>=0.5",
|
|
34
|
+
"mypy>=1.10",
|
|
35
|
+
"pre-commit>=3.7",
|
|
36
|
+
"radon>=6.0.1",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.uv]
|
|
40
|
+
package = true
|
|
41
|
+
|
|
42
|
+
[tool.ruff]
|
|
43
|
+
line-length = 100
|
|
44
|
+
exclude = [".venv", ".git", "build", "dist", "tests/fixtures", "benchmark-tests/codebases"]
|
|
45
|
+
|
|
46
|
+
[tool.pyright]
|
|
47
|
+
exclude = ["benchmark-tests"]
|
|
48
|
+
|
|
49
|
+
[tool.mypy]
|
|
50
|
+
python_version = "3.11"
|
|
51
|
+
strict = true
|
|
52
|
+
packages = ["treepeat"]
|
|
53
|
+
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
addopts = "--cov=treepeat --cov-report=term-missing"
|
|
56
|
+
norecursedirs = "benchmark-tests"
|
treepeat-0.0.1/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""CLI interface for treepeat."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.logging import RichHandler
|
|
8
|
+
|
|
9
|
+
from treepeat.cli.commands import detect, list_ruleset, treesitter
|
|
10
|
+
|
|
11
|
+
console = Console()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def setup_logging(log_level: str) -> None:
    """Route root-logger output through a Rich handler.

    Args:
        log_level: Level name passed straight to ``logging.basicConfig``.
    """
    # Reuse the module-level console so log lines share its output stream.
    rich_handler = RichHandler(console=console, rich_tracebacks=True)
    logging.basicConfig(level=log_level, format="%(message)s", handlers=[rich_handler])
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Root command group: parses the options shared by every subcommand and
# stashes them on the click context. The docstring below is the CLI help text.
@click.group()
@click.pass_context
@click.option(
    "--log-level",
    "-l",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False),
    default="WARNING",
    help="Set the logging level",
)
@click.option(
    "--ruleset",
    "-r",
    type=click.Choice(["none", "default", "loose"], case_sensitive=False),
    default="default",
    help="Built-in ruleset profile to use (default: default)",
)
def main(
    ctx: click.Context,
    log_level: str,
    ruleset: str,
) -> None:
    """Tree-sitter based similarity detector."""
    setup_logging(log_level.upper())

    # Subcommands read these back from ctx.obj.
    ctx.ensure_object(dict)
    ctx.obj.update(log_level=log_level, ruleset=ruleset)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Register subcommands on the root `main` group; each object is imported
# from treepeat.cli.commands at the top of this module.
main.add_command(detect)
main.add_command(treesitter)
main.add_command(list_ruleset)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
"""Detect command - find similar code regions."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
from treepeat.config import LSHSettings, MinHashSettings, PipelineSettings, RulesSettings, ShingleSettings, set_settings
|
|
11
|
+
from treepeat.formatters import format_as_sarif
|
|
12
|
+
from treepeat.models.similarity import Region, RegionSignature, SimilarRegionGroup, SimilarityResult
|
|
13
|
+
from treepeat.pipeline.pipeline import run_pipeline
|
|
14
|
+
|
|
15
|
+
console = Console()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _parse_patterns(pattern_string: str) -> list[str]:
|
|
19
|
+
"""Parse comma-separated pattern string into list."""
|
|
20
|
+
return [p.strip() for p in pattern_string.split(",") if p.strip()]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _create_rules_settings(ruleset: str) -> RulesSettings:
    """Build the ``RulesSettings`` object for the chosen ruleset name.

    Args:
        ruleset: Ruleset name forwarded as the ``ruleset`` keyword.
    """
    rules_settings = RulesSettings(ruleset=ruleset)
    return rules_settings
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _configure_settings(
    ruleset: str,
    similarity_percent: float,
    min_lines: int,
    ignore: str,
    ignore_files: str,
    ignore_node_types: str,
) -> None:
    """Assemble ``PipelineSettings`` from CLI values and install them.

    Args:
        ruleset: Ruleset name for the rules settings.
        similarity_percent: Threshold as a percentage; stored divided by 100.
        min_lines: Minimum line count handed to the LSH settings.
        ignore: Comma-separated patterns stored as ``ignore_patterns``.
        ignore_files: Comma-separated patterns stored as ``ignore_file_patterns``.
        ignore_node_types: Comma-separated node types for the LSH settings.
    """
    # The CLI takes a percentage; the LSH settings receive a fraction.
    threshold = similarity_percent / 100.0
    lsh_settings = LSHSettings(
        similarity_percent=threshold,
        min_lines=min_lines,
        ignore_node_types=_parse_patterns(ignore_node_types),
    )

    # Shingle and MinHash settings are built with no arguments
    # (the original notes mention k=3 and num_perm=128 as the defaults).
    pipeline_settings = PipelineSettings(
        rules=_create_rules_settings(ruleset),
        shingle=ShingleSettings(),
        minhash=MinHashSettings(),
        lsh=lsh_settings,
        ignore_patterns=_parse_patterns(ignore),
        ignore_file_patterns=_parse_patterns(ignore_files),
    )
    set_settings(pipeline_settings)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _write_output(text: str, output_path: Path | None) -> None:
|
|
55
|
+
"""Write output text to file or stdout."""
|
|
56
|
+
if output_path:
|
|
57
|
+
output_path.write_text(text)
|
|
58
|
+
else:
|
|
59
|
+
print(text)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _run_pipeline_with_ui(path: Path, output_format: str) -> SimilarityResult:
    """Run the pipeline, adding console feedback only for console output.

    Args:
        path: Location handed to ``run_pipeline``.
        output_format: Selected output format, compared case-insensitively.

    Returns:
        Whatever ``run_pipeline`` returns for ``path``.
    """
    # Other formats get no banner or status output from this function.
    if output_format.lower() != "console":
        return run_pipeline(path)

    from treepeat.config import get_settings

    active_ruleset = get_settings().rules.ruleset
    console.print(f"\nRuleset: [cyan]{active_ruleset}[/cyan]")
    console.print(f"Analyzing: [cyan]{path}[/cyan]\n")
    with console.status("[bold green]Running pipeline..."):
        return run_pipeline(path)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _group_signatures_by_file(
|
|
76
|
+
signatures: list[RegionSignature],
|
|
77
|
+
) -> dict[Path, list[RegionSignature]]:
|
|
78
|
+
"""Group region signatures by file path."""
|
|
79
|
+
regions_by_file: dict[Path, list[RegionSignature]] = {}
|
|
80
|
+
for sig in signatures:
|
|
81
|
+
path = sig.region.path
|
|
82
|
+
if path not in regions_by_file:
|
|
83
|
+
regions_by_file[path] = []
|
|
84
|
+
regions_by_file[path].append(sig)
|
|
85
|
+
return regions_by_file
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _get_group_sort_key(group: SimilarRegionGroup) -> tuple[float, float]:
|
|
89
|
+
"""Get sort key for a similarity group by similarity and average line count."""
|
|
90
|
+
avg_lines = sum(r.end_line - r.start_line + 1 for r in group.regions) / len(group.regions)
|
|
91
|
+
return (group.similarity, avg_lines)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _format_region_name(region: Region) -> str:
|
|
95
|
+
"""Format region name with type if not lines."""
|
|
96
|
+
if region.region_type == "lines":
|
|
97
|
+
return region.region_name
|
|
98
|
+
return f"{region.region_name}({region.region_type})"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _display_group(group: SimilarRegionGroup, show_diff: bool = False) -> None:
    """Display a single similarity group with optional diff.

    Args:
        group: Group to print; its ``similarity``, ``size`` and ``regions``
            are read.
        show_diff: When true and the group has at least two regions, a diff
            of the first two regions follows the listing.
    """
    # Imported here rather than at module level.
    from treepeat.diff import display_diff

    # Display similarity group header
    console.print(f"Similar group found ([bold]{group.similarity:.1%}[/bold] similar, {group.size} regions):")

    # Display all regions in the group
    for i, region in enumerate(group.regions):
        # Line ranges are inclusive, hence the +1.
        lines = region.end_line - region.start_line + 1
        # Only the first region gets the bullet marker.
        # NOTE(review): prefix literals are reproduced as shown in this view;
        # confirm their exact spacing against the packaged file.
        prefix = " - " if i == 0 else " "
        region_display = _format_region_name(region)
        console.print(
            f"{prefix}{region.path} [{region.start_line}:{region.end_line}] "
            f"({lines} lines) {region_display}"
        )

    # Show diff if requested and we have at least 2 regions
    if show_diff and len(group.regions) >= 2:
        console.print()
        # Only the first two regions are compared, whatever the group size.
        display_diff(group.regions[0], group.regions[1])
    else:
        console.print()  # Blank line between groups
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def display_similar_groups(result: SimilarityResult, show_diff: bool = False) -> None:
    """Print every similar-region group in ``result``.

    Args:
        result: Pipeline result whose ``similar_groups`` are listed.
        show_diff: Forwarded to ``_display_group`` for each group.
    """
    groups = result.similar_groups
    if not groups:
        console.print("\n[yellow]No similar regions found above threshold.[/yellow]")
        return

    console.print("\n[bold cyan]Similar Regions:[/bold cyan]")
    # Ascending sort: the highest-similarity groups are printed last.
    for group in sorted(groups, key=_get_group_sort_key):
        _display_group(group, show_diff=show_diff)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def display_failed_files(result: SimilarityResult, show_details: bool) -> None:
    """Display failed files with optional error details.

    Args:
        result: Pipeline result; ``failed_files`` is iterated as a mapping
            of file path to error.
        show_details: When true, each error is printed under its file.
    """
    # Nothing failed: print no section at all.
    if not result.failed_files:
        return

    console.print("\n[bold red]Failed Files:[/bold red]")
    for file_path, error in result.failed_files.items():
        # NOTE(review): leading spaces in these literals are reproduced as
        # shown in this view; confirm against the packaged file.
        console.print(f" [red]✗[/red] {file_path}")
        if show_details:
            console.print(f" [dim]{error}[/dim]")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _init_language_stats(
|
|
152
|
+
stats_by_format: dict[str, dict[str, int | set[Path]]], language: str
|
|
153
|
+
) -> None:
|
|
154
|
+
"""Initialize stats entry for a language if not present."""
|
|
155
|
+
if language not in stats_by_format:
|
|
156
|
+
stats_by_format[language] = {"files": set(), "groups": 0, "lines": 0}
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _collect_files_from_signatures(
    signatures: list[RegionSignature],
) -> dict[str, dict[str, int | set[Path]]]:
    """Build per-language stats listing every file that has a signature.

    Args:
        signatures: Signatures whose regions supply ``language`` and ``path``.

    Returns:
        A dict keyed by language. Each entry's ``files`` set holds the paths
        seen for that language; the counters are left at their initial values.
    """
    collected: dict[str, dict[str, int | set[Path]]] = {}

    for sig in signatures:
        sig_region = sig.region
        lang = sig_region.language
        _init_language_stats(collected, lang)
        collected[lang]["files"].add(sig_region.path)  # type: ignore[union-attr]

    return collected
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _add_duplicate_stats(
    stats_by_format: dict[str, dict[str, int | set[Path]]],
    similar_groups: list[SimilarRegionGroup],
) -> None:
    """Fold group counts and duplicated line totals into ``stats_by_format``.

    Each region adds its inclusive line span to its own language's ``lines``.
    Each non-empty group adds one to ``groups`` for the language of its first
    region. The dict is mutated in place.
    """
    for group in similar_groups:
        members = group.regions
        for member in members:
            lang = member.language
            _init_language_stats(stats_by_format, lang)
            span = member.end_line - member.start_line + 1
            stats_by_format[lang]["lines"] += span  # type: ignore[operator]

        # A group with no regions contributes nothing.
        if not members:
            continue

        # Count the group once, under its first region's language.
        stats_by_format[members[0].language]["groups"] += 1  # type: ignore[operator]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _collect_format_statistics(result: SimilarityResult) -> dict[str, dict[str, int | set[Path]]]:
    """Compute per-language statistics for a pipeline result.

    The files come from ``result.signatures``; the group and line counts
    come from ``result.similar_groups``.
    """
    per_language = _collect_files_from_signatures(result.signatures)
    _add_duplicate_stats(per_language, result.similar_groups)
    return per_language
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _populate_summary_table(
|
|
201
|
+
table: Table,
|
|
202
|
+
stats_by_format: dict[str, dict[str, int | set[Path]]],
|
|
203
|
+
) -> tuple[set[Path], int, int]:
|
|
204
|
+
"""Populate summary table with format statistics and return totals."""
|
|
205
|
+
total_files: set[Path] = set()
|
|
206
|
+
total_groups = 0
|
|
207
|
+
total_lines = 0
|
|
208
|
+
|
|
209
|
+
for language in sorted(stats_by_format.keys()):
|
|
210
|
+
stats = stats_by_format[language]
|
|
211
|
+
files = stats["files"]
|
|
212
|
+
groups = stats["groups"]
|
|
213
|
+
lines = stats["lines"]
|
|
214
|
+
|
|
215
|
+
# Type narrowing assertions
|
|
216
|
+
assert isinstance(files, set)
|
|
217
|
+
assert isinstance(groups, int)
|
|
218
|
+
assert isinstance(lines, int)
|
|
219
|
+
|
|
220
|
+
table.add_row(
|
|
221
|
+
language,
|
|
222
|
+
str(len(files)),
|
|
223
|
+
str(groups),
|
|
224
|
+
str(lines),
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
# Accumulate totals
|
|
228
|
+
total_files.update(files)
|
|
229
|
+
total_groups += groups
|
|
230
|
+
total_lines += lines
|
|
231
|
+
|
|
232
|
+
return total_files, total_groups, total_lines
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def display_summary_table(result: SimilarityResult) -> None:
    """Print a per-language summary table followed by a totals row.

    Nothing is printed when ``result.signatures`` is empty. The table is
    still shown when no similar groups were found.
    """
    if not result.signatures:
        return

    per_language = _collect_format_statistics(result)

    summary = Table(show_header=True, header_style="bold")
    summary.add_column("Format", style="cyan")
    # The three numeric columns are right-aligned.
    for heading in ("# Files", "Groups Found", "Lines"):
        summary.add_column(heading, justify="right")

    all_files, group_total, line_total = _populate_summary_table(summary, per_language)

    # Totals row, rendered in bold.
    summary.add_row(
        "[bold]Totals[/bold]",
        f"[bold]{len(all_files)}[/bold]",
        f"[bold]{group_total}[/bold]",
        f"[bold]{line_total}[/bold]",
        end_section=True,
    )

    console.print("\n")
    console.print(summary)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _handle_output(
    result: SimilarityResult,
    output_format: str,
    output_path: Path | None,
    log_level: str,
    show_diff: bool = False,
) -> None:
    """Render ``result`` in the requested format.

    Args:
        result: Pipeline result to render.
        output_format: ``"sarif"`` (case-insensitive) selects SARIF output;
            any other value takes the console path.
        output_path: Destination for SARIF text; not used on the console path.
        log_level: Failure details are shown only when this is ``DEBUG``.
        show_diff: Forwarded to the console group display.
    """
    if output_format.lower() == "sarif":
        sarif_text = format_as_sarif(result, pretty=True)
        _write_output(sarif_text, output_path)
        return

    # Console rendering: groups, failures, summary table, trailing blank line.
    verbose_failures = log_level.upper() == "DEBUG"
    display_similar_groups(result, show_diff=show_diff)
    display_failed_files(result, show_details=verbose_failures)
    display_summary_table(result)
    console.print()
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _check_result_errors(result: SimilarityResult, output_format: str) -> None:
|
|
285
|
+
"""Check for errors in the result and exit if necessary."""
|
|
286
|
+
if result.success_count == 0 and result.failure_count > 0:
|
|
287
|
+
if output_format.lower() == "console":
|
|
288
|
+
console.print("[bold red]Error:[/bold red] Failed to parse any files")
|
|
289
|
+
display_failed_files(result, show_details=True)
|
|
290
|
+
sys.exit(1)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# `detect` subcommand. It reads the shared `log_level` and `ruleset` values
# from ctx.obj and takes its own options from the decorators below. The
# function docstring is kept as the one-line summary.
# NOTE(review): `-ml`, `-if` and `-int` are multi-letter single-dash aliases;
# confirm click parses them as intended alongside the one-letter flags.
@click.command()
@click.argument("path", type=click.Path(exists=True, path_type=Path))
@click.pass_context
@click.option(
    "--similarity",
    "-s",
    type=click.IntRange(5, 100),
    default=100,
    help="Percent similarity threshold (default: 100)",
)
@click.option(
    "--min-lines",
    "-ml",
    type=click.IntRange(1),
    default=5,
    help="Minimum number of lines to be considered similar (default: 5)",
)
@click.option(
    "--format",
    "-f",
    "output_format",
    type=click.Choice(["console", "sarif"], case_sensitive=False),
    default="console",
    help="Output format (default: console)",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(path_type=Path),
    default=None,
    help="Output file path (default: stdout)",
)
@click.option(
    "--ignore",
    "-i",
    type=str,
    default="",
    help="Comma-separated list of glob patterns to ignore files (e.g., '*.test.py,**/node_modules/**')",
)
@click.option(
    "--ignore-files",
    "-if",
    type=str,
    default="**/.*ignore",
    help="Comma-separated list of glob patterns to find ignore files (default: '**/.*ignore')",
)
@click.option(
    "--diff",
    "-d",
    is_flag=True,
    default=False,
    help="Show side-by-side diff between the first two files in each similar group (console format only)",
)
@click.option(
    "--fail",
    is_flag=True,
    default=False,
    help="Exit with error code 1 if any similar blocks are detected",
)
@click.option(
    "--ignore-node-types",
    "-int",
    type=str,
    default="",
    help="Comma-separated list of AST node types to ignore during region extraction (e.g., 'parameters,argument_list')",
)
def detect(
    ctx: click.Context,
    path: Path,
    similarity: int,  # click.IntRange(5, 100) yields an int percentage
    min_lines: int,
    output_format: str,
    output: Path | None,
    ignore: str,
    ignore_files: str,
    diff: bool,
    fail: bool,
    ignore_node_types: str,
) -> None:
    """Detect similar code regions of files in a path."""
    # Shared options stored on the context by the root `main` group.
    log_level = ctx.obj["log_level"]
    ruleset = ctx.obj["ruleset"]

    # Settings are installed before the pipeline runs.
    _configure_settings(
        ruleset,
        similarity,
        min_lines,
        ignore,
        ignore_files,
        ignore_node_types,
    )
    result = _run_pipeline_with_ui(path, output_format)
    # Exits with status 1 when files failed and none succeeded.
    _check_result_errors(result, output_format)
    _handle_output(result, output_format, output, log_level, diff)

    # With --fail, exit with status 1 if any similar blocks were detected
    if fail and result.similar_groups:
        sys.exit(1)
|