codeclone 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeclone/__init__.py CHANGED
@@ -0,0 +1,16 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
9
+ from importlib.metadata import version, PackageNotFoundError
10
+
11
def _resolve_version() -> str:
    """Return the installed ``codeclone`` version, or ``"dev"`` if not installed."""
    try:
        return version("codeclone")
    except PackageNotFoundError:
        # No distribution metadata is available for "codeclone".
        return "dev"


__version__ = _resolve_version()

__all__ = ["__version__"]
codeclone/baseline.py CHANGED
@@ -1,3 +1,11 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import json
codeclone/blockhash.py CHANGED
@@ -1,3 +1,11 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import ast
@@ -5,8 +13,9 @@ import hashlib
5
13
 
6
14
  from .normalize import NormalizationConfig, AstNormalizer
7
15
 
16
+
def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str:
    """Return the SHA-1 hex digest of the normalized dump of ``stmt``.

    The statement is passed through ``AstNormalizer(cfg)``, missing source
    locations are filled in, and the tree is dumped with field names but
    without position attributes before hashing.
    """
    normalized = AstNormalizer(cfg).visit(stmt)
    normalized = ast.fix_missing_locations(normalized)
    text = ast.dump(normalized, annotate_fields=True, include_attributes=False)
    digest = hashlib.sha1(text.encode("utf-8"))
    return digest.hexdigest()
codeclone/blocks.py CHANGED
@@ -1,3 +1,11 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import ast
@@ -18,13 +26,13 @@ class BlockUnit:
18
26
 
19
27
 
20
28
  def extract_blocks(
21
- func_node: ast.AST,
22
- *,
23
- filepath: str,
24
- qualname: str,
25
- cfg: NormalizationConfig,
26
- block_size: int,
27
- max_blocks: int,
29
+ func_node: ast.AST,
30
+ *,
31
+ filepath: str,
32
+ qualname: str,
33
+ cfg: NormalizationConfig,
34
+ block_size: int,
35
+ max_blocks: int,
28
36
  ) -> list[BlockUnit]:
29
37
  body = getattr(func_node, "body", None)
30
38
  if not isinstance(body, list) or len(body) < block_size:
@@ -45,16 +53,18 @@ def extract_blocks(
45
53
  if last_start is not None and start - last_start < MIN_LINE_DISTANCE:
46
54
  continue
47
55
 
48
- bh = "|".join(stmt_hashes[i:i + block_size])
56
+ bh = "|".join(stmt_hashes[i : i + block_size])
49
57
 
50
- blocks.append(BlockUnit(
51
- block_hash=bh,
52
- filepath=filepath,
53
- qualname=qualname,
54
- start_line=start,
55
- end_line=end,
56
- size=block_size,
57
- ))
58
+ blocks.append(
59
+ BlockUnit(
60
+ block_hash=bh,
61
+ filepath=filepath,
62
+ qualname=qualname,
63
+ start_line=start,
64
+ end_line=end,
65
+ size=block_size,
66
+ )
67
+ )
58
68
 
59
69
  last_start = start
60
70
  if len(blocks) >= max_blocks:
codeclone/cache.py CHANGED
@@ -1,3 +1,11 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import json
codeclone/cfg.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import ast
12
+ from dataclasses import dataclass, field
13
+ from typing import Iterable
14
+
15
+
16
+ # =========================
17
+ # Core CFG structures
18
+ # =========================
19
+
20
+
21
+ @dataclass(eq=False)
22
+ class Block:
23
+ id: int
24
+ statements: list[ast.stmt] = field(default_factory=list)
25
+ successors: set["Block"] = field(default_factory=set)
26
+ is_terminated: bool = False
27
+
28
+ def add_successor(self, block: Block) -> None:
29
+ self.successors.add(block)
30
+
31
+ def __hash__(self) -> int:
32
+ return hash(self.id)
33
+
34
+ def __eq__(self, other: object) -> bool:
35
+ return isinstance(other, Block) and self.id == other.id
36
+
37
+
38
+ @dataclass
39
+ class CFG:
40
+ qualname: str
41
+ blocks: list[Block] = field(default_factory=list)
42
+
43
+ entry: Block = field(init=False)
44
+ exit: Block = field(init=False)
45
+
46
+ def __post_init__(self) -> None:
47
+ self.entry = self.create_block()
48
+ self.exit = self.create_block()
49
+
50
+ def create_block(self) -> Block:
51
+ block = Block(id=len(self.blocks))
52
+ self.blocks.append(block)
53
+ return block
54
+
55
+
56
+ # =========================
57
+ # CFG Builder
58
+ # =========================
59
+
60
+
61
+ class CFGBuilder:
62
+ def __init__(self) -> None:
63
+ self.cfg: CFG
64
+ self.current: Block
65
+
66
+ def build(
67
+ self,
68
+ qualname: str,
69
+ node: ast.FunctionDef | ast.AsyncFunctionDef,
70
+ ) -> CFG:
71
+ self.cfg = CFG(qualname)
72
+ self.current = self.cfg.entry
73
+
74
+ self._visit_statements(node.body)
75
+
76
+ if not self.current.is_terminated:
77
+ self.current.add_successor(self.cfg.exit)
78
+
79
+ return self.cfg
80
+
81
+ # ---------- Internals ----------
82
+
83
+ def _visit_statements(self, stmts: Iterable[ast.stmt]) -> None:
84
+ for stmt in stmts:
85
+ if self.current.is_terminated:
86
+ break
87
+ self._visit(stmt)
88
+
89
+ def _visit(self, stmt: ast.stmt) -> None:
90
+ match stmt:
91
+ case ast.Return():
92
+ self.current.statements.append(stmt)
93
+ self.current.is_terminated = True
94
+ self.current.add_successor(self.cfg.exit)
95
+
96
+ case ast.Raise():
97
+ self.current.statements.append(stmt)
98
+ self.current.is_terminated = True
99
+ self.current.add_successor(self.cfg.exit)
100
+
101
+ case ast.If():
102
+ self._visit_if(stmt)
103
+
104
+ case ast.While():
105
+ self._visit_while(stmt)
106
+
107
+ case ast.For():
108
+ self._visit_for(stmt)
109
+
110
+ case _:
111
+ self.current.statements.append(stmt)
112
+
113
+ # ---------- Control Flow ----------
114
+
115
+ def _visit_if(self, stmt: ast.If) -> None:
116
+ self.current.statements.append(ast.Expr(value=stmt.test))
117
+
118
+ then_block = self.cfg.create_block()
119
+ else_block = self.cfg.create_block()
120
+ after_block = self.cfg.create_block()
121
+
122
+ self.current.add_successor(then_block)
123
+ self.current.add_successor(else_block)
124
+
125
+ self.current = then_block
126
+ self._visit_statements(stmt.body)
127
+ if not self.current.is_terminated:
128
+ self.current.add_successor(after_block)
129
+
130
+ self.current = else_block
131
+ self._visit_statements(stmt.orelse)
132
+ if not self.current.is_terminated:
133
+ self.current.add_successor(after_block)
134
+
135
+ self.current = after_block
136
+
137
+ def _visit_while(self, stmt: ast.While) -> None:
138
+ cond_block = self.cfg.create_block()
139
+ body_block = self.cfg.create_block()
140
+ after_block = self.cfg.create_block()
141
+
142
+ self.current.add_successor(cond_block)
143
+
144
+ self.current = cond_block
145
+ self.current.statements.append(ast.Expr(value=stmt.test))
146
+ self.current.add_successor(body_block)
147
+ self.current.add_successor(after_block)
148
+
149
+ self.current = body_block
150
+ self._visit_statements(stmt.body)
151
+ if not self.current.is_terminated:
152
+ self.current.add_successor(cond_block)
153
+
154
+ self.current = after_block
155
+
156
+ def _visit_for(self, stmt: ast.For) -> None:
157
+ iter_block = self.cfg.create_block()
158
+ body_block = self.cfg.create_block()
159
+ after_block = self.cfg.create_block()
160
+
161
+ self.current.add_successor(iter_block)
162
+
163
+ self.current = iter_block
164
+ self.current.statements.append(ast.Expr(value=stmt.iter))
165
+ self.current.add_successor(body_block)
166
+ self.current.add_successor(after_block)
167
+
168
+ self.current = body_block
169
+ self._visit_statements(stmt.body)
170
+ if not self.current.is_terminated:
171
+ self.current.add_successor(iter_block)
172
+
173
+ self.current = after_block
codeclone/cli.py CHANGED
@@ -1,39 +1,71 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import argparse
12
+ from concurrent.futures import ProcessPoolExecutor
4
13
  from pathlib import Path
5
14
 
6
15
  from .baseline import Baseline
7
16
  from .cache import Cache, file_stat_signature
8
17
  from .extractor import extract_units_from_source
18
+ from .html_report import build_html_report
9
19
  from .normalize import NormalizationConfig
10
20
  from .report import build_groups, build_block_groups, to_json, to_text
11
21
  from .scanner import iter_py_files, module_name_from_path
12
22
 
13
23
 
14
- def main():
24
def process_file(
    filepath: str,
    root: str,
    cfg: NormalizationConfig,
    min_loc: int,
    min_stmt: int,
) -> tuple[str, dict, list, list] | None:
    """Extract clone-detection units from a single file.

    Args:
        filepath: Path of the file to analyse.
        root: Project root, used to derive the module name.
        cfg: Normalization settings forwarded to the extractor.
        min_loc: Minimum line count forwarded to the extractor.
        min_stmt: Minimum statement count forwarded to the extractor.

    Returns:
        ``(filepath, stat, units, blocks)``, or ``None`` when the file cannot
        be stat'ed, read, or decoded as UTF-8.
    """
    try:
        # Take the signature before reading: if the file changes in between,
        # the stored signature is the older one and the next run re-scans the
        # file, rather than caching stale units under a fresh signature.
        stat = file_stat_signature(filepath)
        source = Path(filepath).read_text("utf-8")
    except (OSError, UnicodeDecodeError):
        # Skip this file instead of letting one vanished or unreadable file
        # propagate out of the worker and abort the whole scan.
        return None

    module_name = module_name_from_path(root, filepath)

    units, blocks = extract_units_from_source(
        source=source,
        filepath=filepath,
        module_name=module_name,
        cfg=cfg,
        min_loc=min_loc,
        min_stmt=min_stmt,
    )

    return filepath, stat, units, blocks
49
+
50
+
51
+ def main() -> None:
15
52
  ap = argparse.ArgumentParser("codeclone")
16
53
  ap.add_argument("root", help="Project root")
54
+ ap.add_argument("--processes", type=int, default=4)
17
55
  ap.add_argument("--cache", default="~/.cache/codeclone/")
18
56
  ap.add_argument("--min-loc", type=int, default=15)
19
57
  ap.add_argument("--min-stmt", type=int, default=6)
20
58
  ap.add_argument("--json-out", default="")
21
59
  ap.add_argument("--text-out", default="")
60
+ ap.add_argument("--html-out", default="")
22
61
  ap.add_argument("--fail-if-groups", type=int, default=-1)
23
62
  ap.add_argument("--baseline", default="~/.config/codeclone/baseline.json")
24
- ap.add_argument("--update-baseline", action="store_true",
25
- help="Write current clones as baseline")
26
- ap.add_argument("--fail-on-new", action="store_true",
27
- help="Fail if new clones appear vs baseline")
63
+ ap.add_argument("--update-baseline", action="store_true")
64
+ ap.add_argument("--fail-on-new", action="store_true")
65
+
28
66
  args = ap.parse_args()
29
67
 
30
- cfg = NormalizationConfig(
31
- ignore_docstrings=True,
32
- ignore_type_annotations=True,
33
- normalize_attributes=True,
34
- normalize_constants=True,
35
- normalize_names=True,
36
- )
68
+ cfg = NormalizationConfig()
37
69
 
38
70
  cache = Cache(args.cache)
39
71
  cache.load()
@@ -42,6 +74,8 @@ def main():
42
74
  all_blocks: list[dict] = []
43
75
  changed = 0
44
76
 
77
+ files_to_process: list[str] = []
78
+
45
79
  for fp in iter_py_files(args.root):
46
80
  stat = file_stat_signature(fp)
47
81
  cached = cache.get_file_entry(fp)
@@ -49,32 +83,52 @@ def main():
49
83
  if cached and cached.get("stat") == stat:
50
84
  all_units.extend(cached.get("units", []))
51
85
  all_blocks.extend(cached.get("blocks", []))
52
- continue
53
-
54
- try:
55
- source = Path(fp).read_text("utf-8")
56
- except UnicodeDecodeError:
57
- continue
58
-
59
- module_name = module_name_from_path(args.root, fp)
60
- units, blocks = extract_units_from_source(
61
- source=source,
62
- filepath=fp,
63
- module_name=module_name,
64
- cfg=cfg,
65
- min_loc=args.min_loc,
66
- min_stmt=args.min_stmt,
67
- )
68
-
69
- cache.put_file_entry(fp, stat, units, blocks)
70
- changed += 1
71
-
72
- all_units.extend([u.__dict__ for u in units])
73
- all_blocks.extend([b.__dict__ for b in blocks])
86
+ else:
87
+ files_to_process.append(fp)
88
+
89
+ with ProcessPoolExecutor(max_workers=args.processes) as executor:
90
+ futures = [
91
+ executor.submit(
92
+ process_file,
93
+ fp,
94
+ args.root,
95
+ cfg,
96
+ args.min_loc,
97
+ args.min_stmt,
98
+ )
99
+ for fp in files_to_process
100
+ ]
101
+
102
+ for future in futures:
103
+ result = future.result()
104
+ if result is None:
105
+ continue
106
+
107
+ fp, stat, units, blocks = result
108
+
109
+ cache.put_file_entry(fp, stat, units, blocks)
110
+ changed += 1
111
+
112
+ all_units.extend([u.__dict__ for u in units])
113
+ all_blocks.extend([b.__dict__ for b in blocks])
74
114
 
75
115
  func_groups = build_groups(all_units)
76
116
  block_groups = build_block_groups(all_blocks)
77
117
 
118
+ if args.html_out:
119
+ out = Path(args.html_out)
120
+ out.parent.mkdir(parents=True, exist_ok=True)
121
+ out.write_text(
122
+ build_html_report(
123
+ func_groups=func_groups,
124
+ block_groups=block_groups,
125
+ title="CodeClone Report",
126
+ context_lines=3,
127
+ max_snippet_lines=220,
128
+ ),
129
+ "utf-8",
130
+ )
131
+
78
132
  baseline = Baseline(args.baseline)
79
133
  baseline.load()
80
134
 
@@ -91,10 +145,7 @@ def main():
91
145
  out = Path(args.json_out)
92
146
  out.parent.mkdir(parents=True, exist_ok=True)
93
147
  out.write_text(
94
- to_json({
95
- "functions": func_groups,
96
- "blocks": block_groups,
97
- }),
148
+ to_json({"functions": func_groups, "blocks": block_groups}),
98
149
  "utf-8",
99
150
  )
100
151
 
@@ -114,26 +165,9 @@ def main():
114
165
  print(f"Function clone groups: {len(func_groups)}")
115
166
  print(f"Block clone groups: {len(block_groups)}")
116
167
 
117
- if args.fail_on_new:
118
- if new_func or new_block:
119
- print("\n❌ New code clones detected\n")
120
-
121
- if new_func:
122
- print(f"New FUNCTION clone groups: {len(new_func)}")
123
- for k in sorted(new_func):
124
- print(f" - {k}")
125
-
126
- if new_block:
127
- print(f"New BLOCK clone groups: {len(new_block)}")
128
- for k in sorted(new_block):
129
- print(f" - {k}")
130
-
131
- raise SystemExit(3)
132
-
133
- print(f"Baseline function clones: {len(baseline.functions)}")
134
- print(f"Baseline block clones: {len(baseline.blocks)}")
135
- print(f"New function clones: {len(new_func)}")
136
- print(f"New block clones: {len(new_block)}")
168
+ if args.fail_on_new and (new_func or new_block):
169
+ print("\n❌ New code clones detected\n")
170
+ raise SystemExit(3)
137
171
 
138
172
  cache.save()
139
173
 
codeclone/extractor.py CHANGED
@@ -1,11 +1,26 @@
1
+ """
2
+ CodeClone — AST and CFG-based code clone detector for Python
3
+ focused on architectural duplication.
4
+
5
+ Copyright (c) 2026 Den Rozhnovskiy
6
+ Licensed under the MIT License.
7
+ """
8
+
1
9
  from __future__ import annotations
2
10
 
3
11
  import ast
4
12
  from dataclasses import dataclass
13
+ from typing import Sequence
5
14
 
6
15
  from .blocks import extract_blocks, BlockUnit
16
+ from .cfg import CFGBuilder
7
17
  from .fingerprint import sha1, bucket_loc
8
- from .normalize import NormalizationConfig, normalized_ast_dump
18
+ from .normalize import NormalizationConfig, normalized_ast_dump_from_list
19
+
20
+
21
+ # =========================
22
+ # Data structures
23
+ # =========================
9
24
 
10
25
 
11
26
  @dataclass(frozen=True)
@@ -20,37 +35,83 @@ class Unit:
20
35
  loc_bucket: str
21
36
 
22
37
 
38
+ # =========================
39
+ # Helpers
40
+ # =========================
41
+
42
+
23
43
  def _stmt_count(node: ast.AST) -> int:
24
44
  body = getattr(node, "body", None)
25
45
  return len(body) if isinstance(body, list) else 0
26
46
 
27
47
 
28
48
  class _QualnameBuilder(ast.NodeVisitor):
29
- def __init__(self):
49
+ def __init__(self) -> None:
30
50
  self.stack: list[str] = []
31
- self.units: list[tuple[str, ast.AST]] = []
51
+ self.units: list[tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = []
32
52
 
33
- def visit_ClassDef(self, node: ast.ClassDef):
53
+ def visit_ClassDef(self, node: ast.ClassDef) -> None:
34
54
  self.stack.append(node.name)
35
55
  self.generic_visit(node)
36
56
  self.stack.pop()
37
57
 
38
- def visit_FunctionDef(self, node: ast.FunctionDef):
58
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
39
59
  name = ".".join(self.stack + [node.name]) if self.stack else node.name
40
60
  self.units.append((name, node))
41
61
 
42
- def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
62
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
43
63
  name = ".".join(self.stack + [node.name]) if self.stack else node.name
44
64
  self.units.append((name, node))
45
65
 
46
66
 
67
+ # =========================
68
+ # CFG fingerprinting
69
+ # =========================
70
+
71
+
72
def get_cfg_fingerprint(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
    cfg: NormalizationConfig,
    qualname: str,
) -> str:
    """
    Return a hash describing the control-flow graph of ``node``.

    Every block is rendered as its id, the normalized dump of its statements
    and the sorted ids of its successors. The renderings are joined in
    block-id order and passed to ``sha1``.
    """
    graph = CFGBuilder().build(qualname, node)

    def _render(block) -> str:
        # Annotated as Sequence[ast.AST] so type checkers accept the block's
        # list[ast.stmt]: Sequence is covariant, list is invariant.
        body: Sequence[ast.AST] = block.statements
        dumped = normalized_ast_dump_from_list(body, cfg)
        targets = ",".join(str(i) for i in sorted(s.id for s in block.successors))
        return f"BLOCK[{block.id}]:{dumped}|SUCCESSORS:{targets}"

    # Ordering by id keeps the hash deterministic.
    ordered = sorted(graph.blocks, key=lambda b: b.id)
    return sha1("|".join([_render(block) for block in ordered]))
101
+
102
+
103
+ # =========================
104
+ # Public API
105
+ # =========================
106
+
107
+
47
108
  def extract_units_from_source(
48
- source: str,
49
- filepath: str,
50
- module_name: str,
51
- cfg: NormalizationConfig,
52
- min_loc: int,
53
- min_stmt: int,
109
+ source: str,
110
+ filepath: str,
111
+ module_name: str,
112
+ cfg: NormalizationConfig,
113
+ min_loc: int,
114
+ min_stmt: int,
54
115
  ) -> tuple[list[Unit], list[BlockUnit]]:
55
116
  try:
56
117
  tree = ast.parse(source)
@@ -66,6 +127,7 @@ def extract_units_from_source(
66
127
  for local_name, node in qb.units:
67
128
  start = getattr(node, "lineno", None)
68
129
  end = getattr(node, "end_lineno", None)
130
+
69
131
  if not start or not end or end < start:
70
132
  continue
71
133
 
@@ -76,26 +138,24 @@ def extract_units_from_source(
76
138
  continue
77
139
 
78
140
  qualname = f"{module_name}:{local_name}"
79
- dump = normalized_ast_dump(node, cfg)
80
- fp = sha1(dump)
81
-
82
- # ✅ __init__ INCLUDED as function-level unit
83
- units.append(Unit(
84
- qualname=qualname,
85
- filepath=filepath,
86
- start_line=start,
87
- end_line=end,
88
- loc=loc,
89
- stmt_count=stmt_count,
90
- fingerprint=fp,
91
- loc_bucket=bucket_loc(loc),
92
- ))
93
-
94
- if (
95
- not local_name.endswith("__init__")
96
- and loc >= 40
97
- and stmt_count >= 10
98
- ):
141
+ fingerprint = get_cfg_fingerprint(node, cfg, qualname)
142
+
143
+ # Function-level unit (including __init__)
144
+ units.append(
145
+ Unit(
146
+ qualname=qualname,
147
+ filepath=filepath,
148
+ start_line=start,
149
+ end_line=end,
150
+ loc=loc,
151
+ stmt_count=stmt_count,
152
+ fingerprint=fingerprint,
153
+ loc_bucket=bucket_loc(loc),
154
+ )
155
+ )
156
+
157
+ # Block-level units (exclude __init__)
158
+ if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
99
159
  blocks = extract_blocks(
100
160
  node,
101
161
  filepath=filepath,