PyPI - treepeat - Versions diffs - 0.0.1__py3-none-any.whl - Mend

treepeat 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

treepeat/__init__.py +0 -0
treepeat/cli/__init__.py +5 -0
treepeat/cli/cli.py +60 -0
treepeat/cli/commands/__init__.py +7 -0
treepeat/cli/commands/detect.py +390 -0
treepeat/cli/commands/list_ruleset.py +82 -0
treepeat/cli/commands/treesitter.py +316 -0
treepeat/config.py +103 -0
treepeat/diff.py +208 -0
treepeat/formatters/__init__.py +6 -0
treepeat/formatters/json.py +52 -0
treepeat/formatters/sarif.py +172 -0
treepeat/models/__init__.py +5 -0
treepeat/models/ast.py +50 -0
treepeat/models/normalization.py +41 -0
treepeat/models/shingle.py +116 -0
treepeat/models/similarity.py +122 -0
treepeat/pipeline/__init__.py +5 -0
treepeat/pipeline/auto_chunk_extraction.py +243 -0
treepeat/pipeline/languages/__init__.py +54 -0
treepeat/pipeline/languages/base.py +36 -0
treepeat/pipeline/languages/bash.py +58 -0
treepeat/pipeline/languages/css.py +71 -0
treepeat/pipeline/languages/html.py +60 -0
treepeat/pipeline/languages/javascript.py +78 -0
treepeat/pipeline/languages/markdown.py +30 -0
treepeat/pipeline/languages/python.py +132 -0
treepeat/pipeline/languages/sql.py +53 -0
treepeat/pipeline/languages/typescript.py +10 -0
treepeat/pipeline/lsh_stage.py +361 -0
treepeat/pipeline/minhash_stage.py +63 -0
treepeat/pipeline/parse.py +310 -0
treepeat/pipeline/pipeline.py +243 -0
treepeat/pipeline/region_extraction.py +248 -0
treepeat/pipeline/rules/__init__.py +0 -0
treepeat/pipeline/rules/engine.py +314 -0
treepeat/pipeline/rules/models.py +46 -0
treepeat/pipeline/rules/parser.py +137 -0
treepeat/pipeline/rules_factory.py +54 -0
treepeat/pipeline/shingle.py +289 -0
treepeat/pipeline/statistical_chunk_extraction.py +441 -0
treepeat/pipeline/verification.py +218 -0
treepeat/terminal_detect.py +274 -0
treepeat-0.0.1.dist-info/METADATA +89 -0
treepeat-0.0.1.dist-info/RECORD +48 -0
treepeat-0.0.1.dist-info/WHEEL +5 -0
treepeat-0.0.1.dist-info/entry_points.txt +2 -0
treepeat-0.0.1.dist-info/top_level.txt +1 -0

treepeat/__init__.py ADDED Viewed

File without changes

treepeat/cli/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""CLI module."""
+from treepeat.cli.cli import main
+__all__ = ["main"]

treepeat/cli/cli.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""CLI interface for treepeat."""
+import logging
+import click
+from rich.console import Console
+from rich.logging import RichHandler
+from treepeat.cli.commands import detect, list_ruleset, treesitter
+console = Console()
+def setup_logging(log_level: str) -> None:
+    """Send root-logger output through a Rich handler at ``log_level``.
+    The handler writes to the module-level ``console`` and renders tracebacks
+    with Rich; records are formatted as the bare message.
+    """
+    rich_handler = RichHandler(console=console, rich_tracebacks=True)
+    logging.basicConfig(level=log_level, format="%(message)s", handlers=[rich_handler])
+@click.group()
+@click.pass_context
+@click.option(
+    "--log-level",
+    "-l",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False),
+    default="WARNING",
+    help="Set the logging level",
+)
+@click.option(
+    "--ruleset",
+    "-r",
+    type=click.Choice(["none", "default", "loose"], case_sensitive=False),
+    default="default",
+    help="Built-in ruleset profile to use (default: default)",
+)
+def main(
+    ctx: click.Context,
+    log_level: str,
+    ruleset: str,
+) -> None:
+    """Tree-sitter based similarity detector."""
+    # The docstring above is the group's help text, so it is kept short.
+    setup_logging(log_level.upper())
+    # Subcommands read these shared options back out of ``ctx.obj``.
+    ctx.ensure_object(dict)
+    ctx.obj.update(log_level=log_level, ruleset=ruleset)
+# Attach each imported subcommand to the top-level ``main`` click group.
+main.add_command(detect)
+main.add_command(treesitter)
+main.add_command(list_ruleset)
+# Run the CLI group when this module is executed directly as a script.
+if __name__ == "__main__":
+    main()

treepeat/cli/commands/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""CLI subcommands."""
+from .detect import detect
+from .list_ruleset import list_ruleset
+from .treesitter import treesitter
+__all__ = ["detect", "list_ruleset", "treesitter"]

treepeat/cli/commands/detect.py ADDED Viewed

@@ -0,0 +1,390 @@
+"""Detect command - find similar code regions."""
+import sys
+from pathlib import Path
+import click
+from rich.console import Console
+from rich.table import Table
+from treepeat.config import LSHSettings, MinHashSettings, PipelineSettings, RulesSettings, ShingleSettings, set_settings
+from treepeat.formatters import format_as_sarif
+from treepeat.models.similarity import Region, RegionSignature, SimilarRegionGroup, SimilarityResult
+from treepeat.pipeline.pipeline import run_pipeline
+console = Console()
+def _parse_patterns(pattern_string: str) -> list[str]:
+    """Split a comma-separated string into trimmed, non-empty patterns."""
+    patterns: list[str] = []
+    for raw_piece in pattern_string.split(","):
+        cleaned = raw_piece.strip()
+        # Empty pieces (e.g. from "a,,b" or a trailing comma) are dropped.
+        if cleaned:
+            patterns.append(cleaned)
+    return patterns
+def _create_rules_settings(ruleset: str) -> RulesSettings:
+    """Wrap the selected ruleset name in a ``RulesSettings`` object."""
+    rules_settings = RulesSettings(ruleset=ruleset)
+    return rules_settings
+def _configure_settings(
+    ruleset: str,
+    similarity_percent: float,
+    min_lines: int,
+    ignore: str,
+    ignore_files: str,
+    ignore_node_types: str,
+) -> None:
+    """Turn CLI option values into pipeline settings and install them.
+    ``similarity_percent`` is divided by 100 before it is stored, and the three
+    ``ignore*`` arguments are comma-separated strings split by ``_parse_patterns``.
+    Shingle and MinHash settings are built with no arguments (their defaults).
+    """
+    similarity_fraction = similarity_percent / 100.0
+    node_types_to_skip = _parse_patterns(ignore_node_types)
+    lsh = LSHSettings(
+        similarity_percent=similarity_fraction,
+        min_lines=min_lines,
+        ignore_node_types=node_types_to_skip,
+    )
+    pipeline_settings = PipelineSettings(
+        rules=_create_rules_settings(ruleset),
+        shingle=ShingleSettings(),
+        minhash=MinHashSettings(),
+        lsh=lsh,
+        ignore_patterns=_parse_patterns(ignore),
+        ignore_file_patterns=_parse_patterns(ignore_files),
+    )
+    set_settings(pipeline_settings)
+def _write_output(text: str, output_path: Path | None) -> None:
+    """Write ``text`` to ``output_path``, or print it to stdout when no path is given.
+    Files are always written as UTF-8 so the result does not depend on the
+    platform's locale encoding (the SARIF output is JSON and may contain
+    non-ASCII text).
+    """
+    if output_path:
+        output_path.write_text(text, encoding="utf-8")
+    else:
+        print(text)
+def _run_pipeline_with_ui(path: Path, output_format: str) -> SimilarityResult:
+    """Run the pipeline, showing a banner and status spinner for console output.
+    Any format other than ``console`` (compared case-insensitively) runs the
+    pipeline with no console output from this function.
+    """
+    if output_format.lower() != "console":
+        return run_pipeline(path)
+    from treepeat.config import get_settings
+    active_ruleset = get_settings().rules.ruleset
+    console.print(f"\nRuleset: [cyan]{active_ruleset}[/cyan]")
+    console.print(f"Analyzing: [cyan]{path}[/cyan]\n")
+    with console.status("[bold green]Running pipeline..."):
+        return run_pipeline(path)
+def _group_signatures_by_file(
+    signatures: list[RegionSignature],
+) -> dict[Path, list[RegionSignature]]:
+    """Bucket signatures by ``sig.region.path``, keeping input order per file."""
+    grouped: dict[Path, list[RegionSignature]] = {}
+    for signature in signatures:
+        # setdefault creates the bucket the first time a path is seen.
+        grouped.setdefault(signature.region.path, []).append(signature)
+    return grouped
+def _get_group_sort_key(group: SimilarRegionGroup) -> tuple[float, float]:
+    """Return ``(similarity, average line count)`` as the sort key for a group.
+    A region's line count is ``end_line - start_line + 1``. A group with no
+    regions gets an average of ``0.0`` instead of raising ``ZeroDivisionError``,
+    matching the empty-group guard in ``_add_duplicate_stats``.
+    """
+    regions = group.regions
+    if not regions:
+        return (group.similarity, 0.0)
+    total_lines = sum(r.end_line - r.start_line + 1 for r in regions)
+    return (group.similarity, total_lines / len(regions))
+def _format_region_name(region: Region) -> str:
+    """Return the region name, suffixed with ``(type)`` unless the type is ``lines``."""
+    if region.region_type != "lines":
+        return f"{region.region_name}({region.region_type})"
+    return region.region_name
+def _display_group(group: SimilarRegionGroup, show_diff: bool = False) -> None:
+    """Print one similarity group: a header, one line per region, then a diff if asked.
+    The diff compares only the first two regions and is shown only when
+    ``show_diff`` is set and the group has at least two regions.
+    """
+    from treepeat.diff import display_diff
+    console.print(f"Similar group found ([bold]{group.similarity:.1%}[/bold] similar, {group.size} regions):")
+    for index, region in enumerate(group.regions):
+        line_count = region.end_line - region.start_line + 1
+        # Only the first region of a group carries the bullet marker.
+        prefix = "    " if index else "  - "
+        label = _format_region_name(region)
+        console.print(
+            f"{prefix}{region.path} [{region.start_line}:{region.end_line}] "
+            f"({line_count} lines) {label}"
+        )
+    # A blank line always follows the region list, before any diff.
+    console.print()
+    if show_diff and len(group.regions) >= 2:
+        display_diff(group.regions[0], group.regions[1])
+def display_similar_groups(result: SimilarityResult, show_diff: bool = False) -> None:
+    """Print every similar-region group, or a notice when there are none.
+    Groups are printed in ascending ``_get_group_sort_key`` order.
+    """
+    groups = result.similar_groups
+    if not groups:
+        console.print("\n[yellow]No similar regions found above threshold.[/yellow]")
+        return
+    console.print("\n[bold cyan]Similar Regions:[/bold cyan]")
+    for group in sorted(groups, key=_get_group_sort_key):
+        _display_group(group, show_diff=show_diff)
+def display_failed_files(result: SimilarityResult, show_details: bool) -> None:
+    """List the files in ``result.failed_files``, with each error when ``show_details`` is set.
+    Prints nothing when there are no failed files.
+    """
+    failures = result.failed_files
+    if not failures:
+        return
+    console.print("\n[bold red]Failed Files:[/bold red]")
+    for failed_path, error in failures.items():
+        console.print(f"  [red]✗[/red] {failed_path}")
+        if not show_details:
+            continue
+        console.print(f"    [dim]{error}[/dim]")
+def _init_language_stats(
+    stats_by_format: dict[str, dict[str, int | set[Path]]], language: str
+) -> None:
+    """Ensure ``stats_by_format`` has an entry for ``language``.
+    A new entry starts with an empty file set and zero group and line counts;
+    an existing entry is left untouched.
+    """
+    stats_by_format.setdefault(language, {"files": set(), "groups": 0, "lines": 0})
+def _collect_files_from_signatures(
+    signatures: list[RegionSignature],
+) -> dict[str, dict[str, int | set[Path]]]:
+    """Build per-language stats whose ``files`` sets hold every signature's path.
+    Group and line counters are left at zero here.
+    """
+    per_language: dict[str, dict[str, int | set[Path]]] = {}
+    for sig in signatures:
+        region = sig.region
+        _init_language_stats(per_language, region.language)
+        per_language[region.language]["files"].add(region.path)  # type: ignore[union-attr]
+    return per_language
+def _add_duplicate_stats(
+    stats_by_format: dict[str, dict[str, int | set[Path]]],
+    similar_groups: list[SimilarRegionGroup],
+) -> None:
+    """Add duplicate line counts and group counts to ``stats_by_format`` in place.
+    Each region adds its line span to its own language. Each non-empty group
+    is counted once, under the language of its first region.
+    """
+    for group in similar_groups:
+        regions = group.regions
+        for region in regions:
+            _init_language_stats(stats_by_format, region.language)
+            line_span = region.end_line - region.start_line + 1
+            stats_by_format[region.language]["lines"] += line_span  # type: ignore[operator]
+        if not regions:
+            continue
+        # The loop above has already initialised the first region's language.
+        stats_by_format[regions[0].language]["groups"] += 1  # type: ignore[operator]
+def _collect_format_statistics(result: SimilarityResult) -> dict[str, dict[str, int | set[Path]]]:
+    """Return per-language stats: files from signatures, then duplicate counts from groups."""
+    per_language = _collect_files_from_signatures(result.signatures)
+    # Mutates ``per_language`` in place; nothing is returned by the helper.
+    _add_duplicate_stats(per_language, result.similar_groups)
+    return per_language
+def _populate_summary_table(
+    table: Table,
+    stats_by_format: dict[str, dict[str, int | set[Path]]],
+) -> tuple[set[Path], int, int]:
+    """Add one row per language (sorted by name) to ``table`` and return the totals.
+    Returns ``(all files seen, total groups, total lines)``.
+    """
+    all_files: set[Path] = set()
+    group_total = 0
+    line_total = 0
+    for language in sorted(stats_by_format):
+        entry = stats_by_format[language]
+        files, groups, lines = entry["files"], entry["groups"], entry["lines"]
+        # Narrow the ``int | set[Path]`` value type for the type checker.
+        assert isinstance(files, set)
+        assert isinstance(groups, int)
+        assert isinstance(lines, int)
+        table.add_row(language, str(len(files)), str(groups), str(lines))
+        all_files |= files
+        group_total += groups
+        line_total += lines
+    return all_files, group_total, line_total
+def display_summary_table(result: SimilarityResult) -> None:
+    """Print a per-language summary table ending in a bold totals row.
+    Nothing is printed when ``result.signatures`` is empty; otherwise the table
+    appears even if no similar groups were found.
+    """
+    if not result.signatures:
+        return
+    stats_by_format = _collect_format_statistics(result)
+    table = Table(show_header=True, header_style="bold")
+    table.add_column("Format", style="cyan")
+    for heading in ("# Files", "Groups Found", "Lines"):
+        table.add_column(heading, justify="right")
+    all_files, group_total, line_total = _populate_summary_table(table, stats_by_format)
+    totals_cells = ("Totals", len(all_files), group_total, line_total)
+    table.add_row(*(f"[bold]{cell}[/bold]" for cell in totals_cells), end_section=True)
+    console.print("\n")
+    console.print(table)
+def _handle_output(
+    result: SimilarityResult,
+    output_format: str,
+    output_path: Path | None,
+    log_level: str,
+    show_diff: bool = False,
+) -> None:
+    """Emit ``result`` as SARIF text or as console output.
+    SARIF goes through ``_write_output`` (file or stdout). Any other format
+    prints the groups, failed files, and summary table to the console, with
+    error details only when ``log_level`` is DEBUG.
+    """
+    if output_format.lower() == "sarif":
+        _write_output(format_as_sarif(result, pretty=True), output_path)
+        return
+    display_similar_groups(result, show_diff=show_diff)
+    wants_error_details = log_level.upper() == "DEBUG"
+    display_failed_files(result, show_details=wants_error_details)
+    display_summary_table(result)
+    console.print()
+def _check_result_errors(result: SimilarityResult, output_format: str) -> None:
+    """Exit with status 1 when failures were recorded and no file succeeded.
+    For console output, an error banner and the failed files (with details)
+    are printed before exiting.
+    """
+    nothing_parsed = result.success_count == 0 and result.failure_count > 0
+    if not nothing_parsed:
+        return
+    if output_format.lower() == "console":
+        console.print("[bold red]Error:[/bold red] Failed to parse any files")
+        display_failed_files(result, show_details=True)
+    sys.exit(1)
+# ``detect`` subcommand. The function docstring is left as-is because it
+# doubles as the command's help text.
+@click.command()
+@click.argument("path", type=click.Path(exists=True, path_type=Path))
+@click.pass_context
+@click.option(
+    "--similarity",
+    "-s",
+    type=click.IntRange(5, 100),
+    default=100,
+    help="Percent similarity threshold (default: 100)",
+)
+@click.option(
+    "--min-lines",
+    "-ml",
+    type=click.IntRange(1),
+    default=5,
+    help="Minimum number of lines to be considered similar (default: 5)",
+)
+@click.option(
+    "--format",
+    "-f",
+    "output_format",
+    type=click.Choice(["console", "sarif"], case_sensitive=False),
+    default="console",
+    help="Output format (default: console)",
+)
+@click.option(
+    "--output",
+    "-o",
+    type=click.Path(path_type=Path),
+    default=None,
+    help="Output file path (default: stdout)",
+)
+@click.option(
+    "--ignore",
+    "-i",
+    type=str,
+    default="",
+    help="Comma-separated list of glob patterns to ignore files (e.g., '*.test.py,**/node_modules/**')",
+)
+@click.option(
+    "--ignore-files",
+    "-if",
+    type=str,
+    default="**/.*ignore",
+    help="Comma-separated list of glob patterns to find ignore files (default: '**/.*ignore')",
+)
+@click.option(
+    "--diff",
+    "-d",
+    is_flag=True,
+    default=False,
+    help="Show side-by-side diff between the first two files in each similar group (console format only)",
+)
+@click.option(
+    "--fail",
+    is_flag=True,
+    default=False,
+    help="Exit with error code 1 if any similar blocks are detected",
+)
+@click.option(
+    "--ignore-node-types",
+    "-int",
+    type=str,
+    default="",
+    help="Comma-separated list of AST node types to ignore during region extraction (e.g., 'parameters,argument_list')",
+)
+# NOTE(review): ``similarity`` is annotated ``float`` but the option type is
+# ``click.IntRange(5, 100)``; ``_configure_settings`` divides it by 100.
+def detect(
+    ctx: click.Context,
+    path: Path,
+    similarity: float,
+    min_lines: int,
+    output_format: str,
+    output: Path | None,
+    ignore: str,
+    ignore_files: str,
+    diff: bool,
+    fail: bool,
+    ignore_node_types: str,
+) -> None:
+    """Detect similar code regions of files in a path."""
+    # Shared options are read from the click context object; a missing key
+    # raises KeyError here.
+    log_level = ctx.obj["log_level"]
+    ruleset = ctx.obj["ruleset"]
+    # Install the settings before the pipeline runs.
+    _configure_settings(
+        ruleset,
+        similarity,
+        min_lines,
+        ignore,
+        ignore_files,
+        ignore_node_types,
+    )
+    result = _run_pipeline_with_ui(path, output_format)
+    # May call sys.exit(1) when failures were recorded and nothing succeeded.
+    _check_result_errors(result, output_format)
+    _handle_output(result, output_format, output, log_level, diff)
+    # With --fail, exit with status 1 when any similar group was found.
+    if fail and result.similar_groups:
+        sys.exit(1)

treepeat/cli/commands/list_ruleset.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""List-ruleset command - display rules in a ruleset."""
+from typing import Any
+import click
+from rich.console import Console
+console = Console()
+def _print_rule_spec(rule: Any) -> None:
+    """Print a rule's languages, action, and a query preview capped at 60 characters."""
+    query = rule.query
+    # Longer queries are cut to their first 60 characters plus an ellipsis.
+    query_preview = f"{query[:60]}..." if len(query) > 60 else query
+    languages = ",".join(rule.languages)
+    action_label = rule.action.value if rule.action else "none"
+    console.print(f"    [dim]languages={languages}, action={action_label}[/dim]")
+    console.print(f"    [dim]query: {query_preview}[/dim]\n")
+def _build_ruleset_header(ruleset_name: str, language_filter: str | None) -> str:
+    """Return ``Ruleset: <name>``, plus ``(language: <filter>)`` when a filter is given."""
+    if not language_filter:
+        return f"Ruleset: {ruleset_name}"
+    return f"Ruleset: {ruleset_name} (language: {language_filter})"
+def _print_empty_ruleset_message(language_filter: str | None) -> None:
+    """Print the notice shown when there are no rules to list.
+    The wording depends on whether a language filter was given.
+    """
+    if not language_filter:
+        console.print("  [dim]No normalization rules - raw AST comparison[/dim]\n")
+        return
+    console.print(f"  [dim]No rules found for language '{language_filter}'[/dim]\n")
+def _filter_rules_by_language(
+    rules: list[tuple[Any, str]], language_filter: str | None
+) -> list[tuple[Any, str]]:
+    """Keep only the ``(rule, description)`` pairs whose rule lists ``language_filter``.
+    With no filter, the input list is returned unchanged (same object).
+    """
+    if not language_filter:
+        return rules
+    matching: list[tuple[Any, str]] = []
+    for rule, description in rules:
+        if language_filter in rule.languages:
+            matching.append((rule, description))
+    return matching
+def _print_rulesets(ruleset_name: str, language_filter: str | None = None) -> None:
+    """Print the header and rules of ``ruleset_name``, optionally limited to one language.
+    When nothing matches, the empty-ruleset notice is printed instead of the list.
+    """
+    from treepeat.pipeline.rules_factory import get_ruleset_with_descriptions
+    matching_rules = _filter_rules_by_language(
+        get_ruleset_with_descriptions(ruleset_name), language_filter
+    )
+    console.print(f"\n[bold blue]{_build_ruleset_header(ruleset_name, language_filter)}[/bold blue]\n")
+    if not matching_rules:
+        _print_empty_ruleset_message(language_filter)
+        return
+    console.print(f"[dim]{len(matching_rules)} rule(s):[/dim]\n")
+    for rule, description in matching_rules:
+        console.print(f"  [cyan]•[/cyan] {description}")
+        _print_rule_spec(rule)
+# ``list-ruleset`` subcommand. The function docstring is left as-is because it
+# doubles as the command's help text.
+@click.command(name="list-ruleset")
+@click.argument(
+    "ruleset",
+    type=click.Choice(["none", "default", "loose"], case_sensitive=False),
+)
+@click.option(
+    "--language",
+    "-l",
+    type=str,
+    default=None,
+    help="Filter rules by language (e.g., python, java, javascript)",
+)
+def list_ruleset(ruleset: str, language: str | None) -> None:
+    """List rules in the specified ruleset.
+    Display all rules in a given ruleset (none/default/loose), optionally
+    filtered by a specific programming language.
+    """
+    # All lookup, filtering, and printing is delegated to ``_print_rulesets``.
+    _print_rulesets(ruleset, language)