treepeat 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. treepeat/__init__.py +0 -0
  2. treepeat/cli/__init__.py +5 -0
  3. treepeat/cli/cli.py +60 -0
  4. treepeat/cli/commands/__init__.py +7 -0
  5. treepeat/cli/commands/detect.py +390 -0
  6. treepeat/cli/commands/list_ruleset.py +82 -0
  7. treepeat/cli/commands/treesitter.py +316 -0
  8. treepeat/config.py +103 -0
  9. treepeat/diff.py +208 -0
  10. treepeat/formatters/__init__.py +6 -0
  11. treepeat/formatters/json.py +52 -0
  12. treepeat/formatters/sarif.py +172 -0
  13. treepeat/models/__init__.py +5 -0
  14. treepeat/models/ast.py +50 -0
  15. treepeat/models/normalization.py +41 -0
  16. treepeat/models/shingle.py +116 -0
  17. treepeat/models/similarity.py +122 -0
  18. treepeat/pipeline/__init__.py +5 -0
  19. treepeat/pipeline/auto_chunk_extraction.py +243 -0
  20. treepeat/pipeline/languages/__init__.py +54 -0
  21. treepeat/pipeline/languages/base.py +36 -0
  22. treepeat/pipeline/languages/bash.py +58 -0
  23. treepeat/pipeline/languages/css.py +71 -0
  24. treepeat/pipeline/languages/html.py +60 -0
  25. treepeat/pipeline/languages/javascript.py +78 -0
  26. treepeat/pipeline/languages/markdown.py +30 -0
  27. treepeat/pipeline/languages/python.py +132 -0
  28. treepeat/pipeline/languages/sql.py +53 -0
  29. treepeat/pipeline/languages/typescript.py +10 -0
  30. treepeat/pipeline/lsh_stage.py +361 -0
  31. treepeat/pipeline/minhash_stage.py +63 -0
  32. treepeat/pipeline/parse.py +310 -0
  33. treepeat/pipeline/pipeline.py +243 -0
  34. treepeat/pipeline/region_extraction.py +248 -0
  35. treepeat/pipeline/rules/__init__.py +0 -0
  36. treepeat/pipeline/rules/engine.py +314 -0
  37. treepeat/pipeline/rules/models.py +46 -0
  38. treepeat/pipeline/rules/parser.py +137 -0
  39. treepeat/pipeline/rules_factory.py +54 -0
  40. treepeat/pipeline/shingle.py +289 -0
  41. treepeat/pipeline/statistical_chunk_extraction.py +441 -0
  42. treepeat/pipeline/verification.py +218 -0
  43. treepeat/terminal_detect.py +274 -0
  44. treepeat-0.0.1.dist-info/METADATA +89 -0
  45. treepeat-0.0.1.dist-info/RECORD +48 -0
  46. treepeat-0.0.1.dist-info/WHEEL +5 -0
  47. treepeat-0.0.1.dist-info/entry_points.txt +2 -0
  48. treepeat-0.0.1.dist-info/top_level.txt +1 -0
treepeat/__init__.py ADDED
File without changes
treepeat/cli/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """CLI module."""
2
+
3
+ from treepeat.cli.cli import main
4
+
5
+ __all__ = ["main"]
treepeat/cli/cli.py ADDED
@@ -0,0 +1,60 @@
1
+ """CLI interface for treepeat."""
2
+
3
+ import logging
4
+
5
+ import click
6
+ from rich.console import Console
7
+ from rich.logging import RichHandler
8
+
9
+ from treepeat.cli.commands import detect, list_ruleset, treesitter
10
+
11
# Module-wide Rich console; also handed to the logging handler below.
console = Console()
12
+
13
+
14
def setup_logging(log_level: str) -> None:
    """Send root-logger output through a Rich handler at the given level.

    Args:
        log_level: Level name passed straight to ``logging.basicConfig``.
    """
    rich_handler = RichHandler(console=console, rich_tracebacks=True)
    logging.basicConfig(
        level=log_level,
        format="%(message)s",
        handlers=[rich_handler],
    )
21
+
22
+
23
@click.group()
@click.pass_context
@click.option(
    "--log-level",
    "-l",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False),
    default="WARNING",
    help="Set the logging level",
)
@click.option(
    "--ruleset",
    "-r",
    type=click.Choice(["none", "default", "loose"], case_sensitive=False),
    default="default",
    help="Built-in ruleset profile to use (default: default)",
)
def main(
    ctx: click.Context,
    log_level: str,
    ruleset: str,
) -> None:
    """Tree-sitter based similarity detector."""
    # NOTE: the docstring above is rendered by click as the group's --help text.
    setup_logging(log_level.upper())

    # Share the group-level options with subcommands through the click context.
    ctx.ensure_object(dict)
    ctx.obj.update(log_level=log_level, ruleset=ruleset)
51
+
52
+
53
# Attach every subcommand to the top-level group, in display order.
for _subcommand in (detect, treesitter, list_ruleset):
    main.add_command(_subcommand)


if __name__ == "__main__":
    main()
treepeat/cli/commands/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """CLI subcommands."""
2
+
3
+ from .detect import detect
4
+ from .list_ruleset import list_ruleset
5
+ from .treesitter import treesitter
6
+
7
+ __all__ = ["detect", "list_ruleset", "treesitter"]
treepeat/cli/commands/detect.py ADDED
@@ -0,0 +1,390 @@
1
+ """Detect command - find similar code regions."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import click
7
+ from rich.console import Console
8
+ from rich.table import Table
9
+
10
+ from treepeat.config import LSHSettings, MinHashSettings, PipelineSettings, RulesSettings, ShingleSettings, set_settings
11
+ from treepeat.formatters import format_as_sarif
12
+ from treepeat.models.similarity import Region, RegionSignature, SimilarRegionGroup, SimilarityResult
13
+ from treepeat.pipeline.pipeline import run_pipeline
14
+
15
# Shared Rich console used by every display helper in this module.
console = Console()
16
+
17
+
18
def _parse_patterns(pattern_string: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty patterns.

    Args:
        pattern_string: Raw option value such as ``"*.py, build/**"``.

    Returns:
        The patterns in their original order; blank entries are dropped.
    """
    patterns: list[str] = []
    for raw_piece in pattern_string.split(","):
        cleaned = raw_piece.strip()
        if cleaned:
            patterns.append(cleaned)
    return patterns
21
+
22
+
23
def _create_rules_settings(ruleset: str) -> RulesSettings:
    """Wrap the selected ruleset name in a ``RulesSettings`` instance.

    Args:
        ruleset: Name of the ruleset profile chosen on the command line.
    """
    rules_settings = RulesSettings(ruleset=ruleset)
    return rules_settings
26
+
27
+
28
def _configure_settings(
    ruleset: str,
    similarity_percent: float,
    min_lines: int,
    ignore: str,
    ignore_files: str,
    ignore_node_types: str,
) -> None:
    """Build the pipeline settings from CLI values and install them globally.

    Args:
        ruleset: Ruleset profile name.
        similarity_percent: Threshold as a percentage; stored as a fraction.
        min_lines: Minimum region length in lines.
        ignore: Comma-separated glob patterns of files to skip.
        ignore_files: Comma-separated glob patterns locating ignore files.
        ignore_node_types: Comma-separated AST node types to skip.
    """
    # The CLI speaks percentages; LSHSettings receives the value divided by 100.
    threshold_fraction = similarity_percent / 100.0
    node_types_to_skip = _parse_patterns(ignore_node_types)
    lsh_config = LSHSettings(
        similarity_percent=threshold_fraction,
        min_lines=min_lines,
        ignore_node_types=node_types_to_skip,
    )

    pipeline_settings = PipelineSettings(
        rules=_create_rules_settings(ruleset),
        shingle=ShingleSettings(),  # constructed with its own defaults
        minhash=MinHashSettings(),  # constructed with its own defaults
        lsh=lsh_config,
        ignore_patterns=_parse_patterns(ignore),
        ignore_file_patterns=_parse_patterns(ignore_files),
    )
    set_settings(pipeline_settings)
52
+
53
+
54
def _write_output(text: str, output_path: Path | None) -> None:
    """Write the formatted report to a file, or to stdout when no path is given.

    Args:
        text: Fully formatted report text.
        output_path: Destination file, or ``None`` to print to stdout.
    """
    if output_path:
        # Pin UTF-8 so the report does not depend on the platform's locale
        # encoding (the default for Path.write_text).
        output_path.write_text(text, encoding="utf-8")
    else:
        print(text)
60
+
61
+
62
def _run_pipeline_with_ui(path: Path, output_format: str) -> SimilarityResult:
    """Run the pipeline, showing progress feedback only for console output.

    Args:
        path: File or directory to analyze.
        output_format: Selected output format; compared case-insensitively.

    Returns:
        The result returned by ``run_pipeline``.
    """
    if output_format.lower() != "console":
        # Non-console formats stay silent so their output is not polluted.
        return run_pipeline(path)

    from rich.markup import escape

    from treepeat.config import get_settings

    settings = get_settings()
    console.print(f"\nRuleset: [cyan]{settings.rules.ruleset}[/cyan]")
    # Escape the user-supplied path: a segment such as "[id]" would otherwise
    # be parsed as a Rich style tag and vanish from the message.
    console.print(f"Analyzing: [cyan]{escape(str(path))}[/cyan]\n")
    with console.status("[bold green]Running pipeline..."):
        return run_pipeline(path)
73
+
74
+
75
def _group_signatures_by_file(
    signatures: list[RegionSignature],
) -> dict[Path, list[RegionSignature]]:
    """Bucket region signatures by the path of the file they belong to.

    Args:
        signatures: Signatures in any order.

    Returns:
        Mapping of file path to its signatures, preserving input order both
        for the keys (first appearance) and within each bucket.
    """
    grouped: dict[Path, list[RegionSignature]] = {}
    for signature in signatures:
        grouped.setdefault(signature.region.path, []).append(signature)
    return grouped
86
+
87
+
88
def _get_group_sort_key(group: SimilarRegionGroup) -> tuple[float, float]:
    """Return the sort key for a group: similarity, then average region length.

    Args:
        group: Similarity group whose regions expose ``start_line``/``end_line``.

    Returns:
        ``(similarity, average_line_count)``. A group without regions gets an
        average of ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    regions = group.regions
    if not regions:
        # Mirrors the empty-group guard in _add_duplicate_stats.
        return (group.similarity, 0.0)
    # Line ranges are inclusive, hence the +1.
    total_lines = sum(region.end_line - region.start_line + 1 for region in regions)
    return (group.similarity, total_lines / len(regions))
92
+
93
+
94
def _format_region_name(region: Region) -> str:
    """Return the region's display name, tagged with its type unless it is "lines".

    Args:
        region: Region exposing ``region_name`` and ``region_type``.
    """
    name = region.region_name
    kind = region.region_type
    return name if kind == "lines" else f"{name}({kind})"
99
+
100
+
101
def _display_group(group: SimilarRegionGroup, show_diff: bool = False) -> None:
    """Print one similarity group, optionally followed by a diff.

    Args:
        group: Group to display; every region is listed on its own line.
        show_diff: When true and the group has at least two regions, show a
            diff between the first two regions.
    """
    from rich.markup import escape

    from treepeat.diff import display_diff

    # Header line for the group
    console.print(f"Similar group found ([bold]{group.similarity:.1%}[/bold] similar, {group.size} regions):")

    for i, region in enumerate(group.regions):
        lines = region.end_line - region.start_line + 1  # inclusive range
        prefix = " - " if i == 0 else " "
        # Paths and names come from the scanned project. Escape them so that
        # bracketed segments (e.g. "pages/[id]/index.ts") are printed
        # literally instead of being swallowed as Rich style tags.
        region_path = escape(str(region.path))
        region_display = escape(_format_region_name(region))
        console.print(
            f"{prefix}{region_path} [{region.start_line}:{region.end_line}] "
            f"({lines} lines) {region_display}"
        )

    # Only the first two regions are diffed.
    if show_diff and len(group.regions) >= 2:
        console.print()
        display_diff(group.regions[0], group.regions[1])
    else:
        console.print()  # Blank line between groups
124
+
125
+
126
def display_similar_groups(result: SimilarityResult, show_diff: bool = False) -> None:
    """Print every similar-region group, or a notice when there are none.

    Args:
        result: Pipeline result holding ``similar_groups``.
        show_diff: Forwarded to ``_display_group`` for each group.
    """
    groups = result.similar_groups
    if groups:
        console.print("\n[bold cyan]Similar Regions:[/bold cyan]")
        # Ascending order by (similarity, average line count).
        for group in sorted(groups, key=_get_group_sort_key):
            _display_group(group, show_diff=show_diff)
    else:
        console.print("\n[yellow]No similar regions found above threshold.[/yellow]")
137
+
138
+
139
def display_failed_files(result: SimilarityResult, show_details: bool) -> None:
    """List the files that failed to process, optionally with their errors.

    Args:
        result: Pipeline result; ``failed_files`` maps file path to error.
        show_details: When true, print each error beneath its file.
    """
    if not result.failed_files:
        return

    from rich.markup import escape

    console.print("\n[bold red]Failed Files:[/bold red]")
    for file_path, error in result.failed_files.items():
        # Paths and error text are arbitrary. Escape them so bracketed content
        # is shown verbatim rather than parsed as Rich markup, which can hide
        # text or raise MarkupError on a stray closing tag.
        console.print(f" [red]✗[/red] {escape(str(file_path))}")
        if show_details:
            console.print(f" [dim]{escape(str(error))}[/dim]")
149
+
150
+
151
def _init_language_stats(
    stats_by_format: dict[str, dict[str, int | set[Path]]], language: str
) -> None:
    """Ensure ``stats_by_format`` has a zeroed entry for ``language``.

    An existing entry is left untouched; the mapping is modified in place.
    """
    stats_by_format.setdefault(language, {"files": set(), "groups": 0, "lines": 0})
157
+
158
+
159
def _collect_files_from_signatures(
    signatures: list[RegionSignature],
) -> dict[str, dict[str, int | set[Path]]]:
    """Build per-language stats seeded with the files seen in ``signatures``.

    Returns:
        Mapping of language to its stats entry; only the ``"files"`` set is
        filled in here, the counters stay at zero.
    """
    per_language: dict[str, dict[str, int | set[Path]]] = {}

    for signature in signatures:
        region = signature.region
        _init_language_stats(per_language, region.language)
        per_language[region.language]["files"].add(region.path)  # type: ignore[union-attr]

    return per_language
173
+
174
+
175
def _add_duplicate_stats(
    stats_by_format: dict[str, dict[str, int | set[Path]]],
    similar_groups: list[SimilarRegionGroup],
) -> None:
    """Accumulate duplicate-line and group counts into ``stats_by_format``.

    Every region adds its inclusive line count to its own language. Each
    group is counted once, under the language of its first region.
    """
    for group in similar_groups:
        for region in group.regions:
            _init_language_stats(stats_by_format, region.language)
            line_count = region.end_line - region.start_line + 1
            stats_by_format[region.language]["lines"] += line_count  # type: ignore[operator]

        # Skip empty groups; otherwise attribute the group to its first region's language.
        if not group.regions:
            continue
        group_language = group.regions[0].language
        stats_by_format[group_language]["groups"] += 1  # type: ignore[operator]
191
+
192
+
193
def _collect_format_statistics(result: SimilarityResult) -> dict[str, dict[str, int | set[Path]]]:
    """Return per-language statistics for all processed files and duplicates.

    File sets come from ``result.signatures``; group and line counters come
    from ``result.similar_groups``.
    """
    per_language = _collect_files_from_signatures(result.signatures)
    _add_duplicate_stats(per_language, result.similar_groups)
    return per_language
198
+
199
+
200
def _populate_summary_table(
    table: Table,
    stats_by_format: dict[str, dict[str, int | set[Path]]],
) -> tuple[set[Path], int, int]:
    """Add one row per language to ``table`` and return the overall totals.

    Args:
        table: Table receiving a row of (language, file count, groups, lines).
        stats_by_format: Per-language stats with "files", "groups", "lines".

    Returns:
        ``(all_files, total_groups, total_lines)`` across every language.
    """
    all_files: set[Path] = set()
    group_total = 0
    line_total = 0

    # Rows are emitted in alphabetical order of language name.
    for language in sorted(stats_by_format):
        entry = stats_by_format[language]
        files, groups, lines = entry["files"], entry["groups"], entry["lines"]

        # Narrow the union-typed values for the type checker.
        assert isinstance(files, set)
        assert isinstance(groups, int)
        assert isinstance(lines, int)

        table.add_row(language, str(len(files)), str(groups), str(lines))

        all_files |= files
        group_total += groups
        line_total += lines

    return all_files, group_total, line_total
233
+
234
+
235
def display_summary_table(result: SimilarityResult) -> None:
    """Print a per-language summary table followed by a totals row.

    Nothing is printed when the result has no signatures. The table is still
    shown when no similar groups were found, so processed files stay visible.
    """
    if not result.signatures:
        return

    per_language = _collect_format_statistics(result)

    summary = Table(show_header=True, header_style="bold")
    summary.add_column("Format", style="cyan")
    for heading in ("# Files", "Groups Found", "Lines"):
        summary.add_column(heading, justify="right")

    # Per-language rows first; the helper hands back the grand totals.
    all_files, group_total, line_total = _populate_summary_table(summary, per_language)

    summary.add_row(
        "[bold]Totals[/bold]",
        f"[bold]{len(all_files)}[/bold]",
        f"[bold]{group_total}[/bold]",
        f"[bold]{line_total}[/bold]",
        end_section=True,
    )

    console.print("\n")
    console.print(summary)
264
+
265
+
266
def _handle_output(
    result: SimilarityResult,
    output_format: str,
    output_path: Path | None,
    log_level: str,
    show_diff: bool = False,
) -> None:
    """Render ``result`` in the requested format.

    Args:
        result: Pipeline result to present.
        output_format: "sarif" writes a SARIF document; anything else is
            rendered on the console.
        output_path: Destination for SARIF output; ``None`` means stdout.
        log_level: Failed-file error details are shown only at DEBUG.
        show_diff: Forwarded to the console group display.
    """
    if output_format.lower() == "sarif":
        _write_output(format_as_sarif(result, pretty=True), output_path)
        return

    # Console rendering: groups, then failures, then the summary table.
    debug_enabled = log_level.upper() == "DEBUG"
    display_similar_groups(result, show_diff=show_diff)
    display_failed_files(result, show_details=debug_enabled)
    display_summary_table(result)
    console.print()
282
+
283
+
284
def _check_result_errors(result: SimilarityResult, output_format: str) -> None:
    """Exit with status 1 when files were attempted but none succeeded.

    Args:
        result: Pipeline result exposing ``success_count``/``failure_count``.
        output_format: The error report is printed only for "console".

    Raises:
        SystemExit: With code 1 when every attempted file failed.
    """
    nothing_parsed = result.success_count == 0 and result.failure_count > 0
    if not nothing_parsed:
        return

    if output_format.lower() == "console":
        console.print("[bold red]Error:[/bold red] Failed to parse any files")
        display_failed_files(result, show_details=True)
    sys.exit(1)
291
+
292
+
293
@click.command()
@click.argument("path", type=click.Path(exists=True, path_type=Path))
@click.pass_context
@click.option(
    "--similarity",
    "-s",
    type=click.IntRange(5, 100),
    default=100,
    help="Percent similarity threshold (default: 100)",
)
@click.option(
    "--min-lines",
    "-ml",
    type=click.IntRange(1),
    default=5,
    help="Minimum number of lines to be considered similar (default: 5)",
)
@click.option(
    "--format",
    "-f",
    "output_format",
    type=click.Choice(["console", "sarif"], case_sensitive=False),
    default="console",
    help="Output format (default: console)",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(path_type=Path),
    default=None,
    help="Output file path (default: stdout)",
)
@click.option(
    "--ignore",
    "-i",
    type=str,
    default="",
    help="Comma-separated list of glob patterns to ignore files (e.g., '*.test.py,**/node_modules/**')",
)
@click.option(
    "--ignore-files",
    "-if",
    type=str,
    default="**/.*ignore",
    help="Comma-separated list of glob patterns to find ignore files (default: '**/.*ignore')",
)
@click.option(
    "--diff",
    "-d",
    is_flag=True,
    default=False,
    help="Show side-by-side diff between the first two files in each similar group (console format only)",
)
@click.option(
    "--fail",
    is_flag=True,
    default=False,
    help="Exit with error code 1 if any similar blocks are detected",
)
@click.option(
    "--ignore-node-types",
    "-int",
    type=str,
    default="",
    help="Comma-separated list of AST node types to ignore during region extraction (e.g., 'parameters,argument_list')",
)
def detect(
    ctx: click.Context,
    path: Path,
    similarity: float,
    min_lines: int,
    output_format: str,
    output: Path | None,
    ignore: str,
    ignore_files: str,
    diff: bool,
    fail: bool,
    ignore_node_types: str,
) -> None:
    """Detect similar code regions of files in a path."""
    # NOTE: the docstring above is rendered by click as this command's --help text.
    # Options declared on the parent group arrive through the shared context.
    shared_options = ctx.obj
    log_level = shared_options["log_level"]
    ruleset = shared_options["ruleset"]

    _configure_settings(
        ruleset=ruleset,
        similarity_percent=similarity,
        min_lines=min_lines,
        ignore=ignore,
        ignore_files=ignore_files,
        ignore_node_types=ignore_node_types,
    )

    result = _run_pipeline_with_ui(path, output_format)
    _check_result_errors(result, output_format)
    _handle_output(result, output_format, output, log_level, show_diff=diff)

    # With --fail, any detected similarity turns into a non-zero exit status.
    if fail and result.similar_groups:
        sys.exit(1)
treepeat/cli/commands/list_ruleset.py ADDED
@@ -0,0 +1,82 @@
1
+ """List-ruleset command - display rules in a ruleset."""
2
+
3
+ from typing import Any
4
+
5
+ import click
6
+ from rich.console import Console
7
+
8
# Shared Rich console for all output produced by this command module.
console = Console()
9
+
10
+
11
def _print_rule_spec(rule: Any) -> None:
    """Print a rule's languages, action and a shortened preview of its query.

    Args:
        rule: Rule object exposing ``query``, ``languages`` and ``action``.
    """
    from rich.markup import escape

    # Show at most 60 characters of the query, marking truncation with "...".
    query_preview = rule.query[:60] + "..." if len(rule.query) > 60 else rule.query
    rule_spec = f"languages={','.join(rule.languages)}, action={rule.action.value if rule.action else 'none'}"
    console.print(f" [dim]{rule_spec}[/dim]")
    # Query text can contain bracketed character classes such as "[a-z]",
    # which Rich would otherwise consume as a style tag. Escape after
    # truncating so the 60-character cut never splits an escape sequence.
    console.print(f" [dim]query: {escape(query_preview)}[/dim]\n")
17
+
18
+
19
def _build_ruleset_header(ruleset_name: str, language_filter: str | None) -> str:
    """Return the header line for a ruleset listing.

    Args:
        ruleset_name: Name of the ruleset being listed.
        language_filter: Active language filter; omitted when falsy.
    """
    suffix = f" (language: {language_filter})" if language_filter else ""
    return f"Ruleset: {ruleset_name}{suffix}"
25
+
26
+
27
def _print_empty_ruleset_message(language_filter: str | None) -> None:
    """Explain why a ruleset listing is empty.

    Args:
        language_filter: Active language filter; when falsy the message says
            the ruleset itself has no normalization rules.
    """
    message = (
        f" [dim]No rules found for language '{language_filter}'[/dim]\n"
        if language_filter
        else " [dim]No normalization rules - raw AST comparison[/dim]\n"
    )
    console.print(message)
33
+
34
+
35
def _filter_rules_by_language(
    rules: list[tuple[Any, str]], language_filter: str | None
) -> list[tuple[Any, str]]:
    """Keep only the rules that apply to ``language_filter``.

    Args:
        rules: ``(rule, description)`` pairs; each rule exposes ``languages``.
        language_filter: Language to match exactly; when falsy the input
            list is returned unchanged.

    Returns:
        The same list object when no filter is given, otherwise a new list.
    """
    if language_filter:
        matching: list[tuple[Any, str]] = []
        for rule, description in rules:
            if language_filter in rule.languages:
                matching.append((rule, description))
        return matching
    return rules
42
+
43
+
44
def _print_rulesets(ruleset_name: str, language_filter: str | None = None) -> None:
    """Print the rules of a ruleset, optionally restricted to one language.

    Args:
        ruleset_name: Name passed to ``get_ruleset_with_descriptions``.
        language_filter: When given, only rules listing this language are shown.
    """
    from treepeat.pipeline.rules_factory import get_ruleset_with_descriptions

    selected_rules = _filter_rules_by_language(
        get_ruleset_with_descriptions(ruleset_name), language_filter
    )

    header = _build_ruleset_header(ruleset_name, language_filter)
    console.print(f"\n[bold blue]{header}[/bold blue]\n")

    if selected_rules:
        console.print(f"[dim]{len(selected_rules)} rule(s):[/dim]\n")
        for rule, description in selected_rules:
            console.print(f" [cyan]•[/cyan] {description}")
            _print_rule_spec(rule)
    else:
        _print_empty_ruleset_message(language_filter)
62
+
63
+
64
@click.command(name="list-ruleset")
@click.argument(
    "ruleset",
    type=click.Choice(["none", "default", "loose"], case_sensitive=False),
)
@click.option(
    "--language",
    "-l",
    type=str,
    default=None,
    help="Filter rules by language (e.g., python, java, javascript)",
)
def list_ruleset(ruleset: str, language: str | None) -> None:
    """List rules in the specified ruleset.

    Display all rules in a given ruleset (none/default/loose), optionally
    filtered by a specific programming language.
    """
    # NOTE: the docstring above is rendered by click as this command's --help text.
    _print_rulesets(ruleset_name=ruleset, language_filter=language)