codeclone 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codeclone/extractor.py CHANGED
@@ -9,21 +9,24 @@ Licensed under the MIT License.
9
9
  from __future__ import annotations
10
10
 
11
11
  import ast
12
+ import os
13
+ import signal
14
+ from collections.abc import Iterator
15
+ from contextlib import contextmanager
12
16
  from dataclasses import dataclass
13
- from typing import Sequence
14
17
 
15
- from .blocks import extract_blocks, BlockUnit
18
+ from .blocks import BlockUnit, extract_blocks
16
19
  from .cfg import CFGBuilder
17
- from .fingerprint import sha1, bucket_loc
20
+ from .errors import ParseError
21
+ from .fingerprint import bucket_loc, sha1
18
22
  from .normalize import NormalizationConfig, normalized_ast_dump_from_list
19
23
 
20
-
21
24
  # =========================
22
25
  # Data structures
23
26
  # =========================
24
27
 
25
28
 
26
- @dataclass(frozen=True)
29
+ @dataclass(frozen=True, slots=True)
27
30
  class Unit:
28
31
  qualname: str
29
32
  filepath: str
@@ -39,6 +42,67 @@ class Unit:
39
42
  # Helpers
40
43
  # =========================
41
44
 
45
# Time budget, in seconds, handed to the parser for each source file.
PARSE_TIMEOUT_SECONDS: int = 5
46
+
47
+
48
class _ParseTimeoutError(Exception):
    """Internal signal that AST parsing exceeded its time budget."""
50
+
51
+
52
@contextmanager
def _parse_limits(timeout_s: int) -> Iterator[None]:
    """
    Bound the work done inside the ``with`` body by time limits.

    On POSIX, when entered from the main thread with a positive
    ``timeout_s``:

    * a ``SIGALRM`` interval timer is armed for ``timeout_s`` wall-clock
      seconds; its handler raises ``_ParseTimeoutError``;
    * as a best-effort backstop, the *soft* ``RLIMIT_CPU`` is lowered to the
      CPU time the process has already used plus ``timeout_s`` seconds.

    The hard CPU limit is never modified: an unprivileged process cannot
    raise a hard limit again, so lowering it would cap the whole process
    permanently. The soft limit is computed from the CPU time already
    consumed because ``RLIMIT_CPU`` counts the process's total CPU time, not
    the time spent inside this block.

    On exit (normal or via an exception) the alarm is disarmed and the
    previous ``SIGALRM`` handler and CPU limits are put back.

    In every other situation (non-POSIX platform, ``timeout_s <= 0``, or a
    non-main thread, where signal handlers cannot be installed) the context
    manager does nothing.

    Args:
        timeout_s: Budget in seconds; values ``<= 0`` disable all limits.

    Yields:
        None.

    Raises:
        _ParseTimeoutError: Raised by the alarm handler when the timer
            fires while the body is still running.
    """
    import threading

    limits_apply = (
        os.name == "posix"
        and timeout_s > 0
        # signal.signal() raises ValueError outside the main thread.
        and threading.current_thread() is threading.main_thread()
    )
    if not limits_apply:
        yield
        return

    def _timeout_handler(_signum: int, _frame: object) -> None:
        raise _ParseTimeoutError("AST parsing timeout")

    previous_handler = signal.getsignal(signal.SIGALRM)
    if previous_handler is None:
        # getsignal() returns None for a handler that was not installed from
        # Python; signal() rejects None, so restore the default action.
        previous_handler = signal.SIG_DFL

    old_limits: tuple[int, int] | None = None
    signal.signal(signal.SIGALRM, _timeout_handler)
    try:
        signal.setitimer(signal.ITIMER_REAL, timeout_s)

        try:
            import resource

            soft, hard = resource.getrlimit(resource.RLIMIT_CPU)
            usage = resource.getrusage(resource.RUSAGE_SELF)
            # Budget is relative to CPU time already spent; round the spent
            # time up so the body never gets less than timeout_s seconds.
            new_soft = int(usage.ru_utime + usage.ru_stime) + 1 + timeout_s
            # Never exceed a limit that is already in force.
            for ceiling in (soft, hard):
                if ceiling != resource.RLIM_INFINITY:
                    new_soft = min(new_soft, ceiling)
            if new_soft != soft:
                resource.setrlimit(resource.RLIMIT_CPU, (new_soft, hard))
                old_limits = (soft, hard)
        except (ImportError, OSError, ValueError):
            # If resource is unavailable or cannot be set, rely on alarm only.
            pass

        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, previous_handler)
        if old_limits is not None:
            try:
                import resource

                resource.setrlimit(resource.RLIMIT_CPU, old_limits)
            except (ImportError, OSError, ValueError):
                # Best effort: only the soft limit was changed, so this is
                # not expected to fail; never mask the body's own outcome.
                pass
97
+
98
+
99
def _parse_with_limits(source: str, timeout_s: int) -> ast.AST:
    """
    Parse ``source`` into an AST while ``_parse_limits`` is in effect.

    Args:
        source: Python source text to parse.
        timeout_s: Time budget in seconds, forwarded to ``_parse_limits``.

    Returns:
        The tree produced by ``ast.parse``.

    Raises:
        ParseError: If the parse timed out; chained to the internal
            ``_ParseTimeoutError``. Any other exception raised by
            ``ast.parse`` (for example ``SyntaxError``) propagates unchanged.
    """
    try:
        with _parse_limits(timeout_s):
            tree = ast.parse(source)
    except _ParseTimeoutError as timeout_exc:
        # Translate the private timeout signal into the package's public error.
        raise ParseError(str(timeout_exc)) from timeout_exc
    return tree
105
+
42
106
 
43
107
  def _stmt_count(node: ast.AST) -> int:
44
108
  body = getattr(node, "body", None)
@@ -46,6 +110,8 @@ def _stmt_count(node: ast.AST) -> int:
46
110
 
47
111
 
48
112
  class _QualnameBuilder(ast.NodeVisitor):
113
+ __slots__ = ("stack", "units")
114
+
49
115
  def __init__(self) -> None:
50
116
  self.stack: list[str] = []
51
117
  self.units: list[tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = []
@@ -56,11 +122,11 @@ class _QualnameBuilder(ast.NodeVisitor):
56
122
  self.stack.pop()
57
123
 
58
124
  def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
59
- name = ".".join(self.stack + [node.name]) if self.stack else node.name
125
+ name = ".".join([*self.stack, node.name]) if self.stack else node.name
60
126
  self.units.append((name, node))
61
127
 
62
128
  def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
63
- name = ".".join(self.stack + [node.name]) if self.stack else node.name
129
+ name = ".".join([*self.stack, node.name]) if self.stack else node.name
64
130
  self.units.append((name, node))
65
131
 
66
132
 
@@ -75,28 +141,39 @@ def get_cfg_fingerprint(
75
141
  qualname: str,
76
142
  ) -> str:
77
143
  """
78
- Build CFG, normalize it into a canonical form, and hash it.
144
+ Generate a structural fingerprint for a function using CFG analysis.
145
+
146
+ The fingerprint is computed by:
147
+ 1. Building a Control Flow Graph (CFG) from the function
148
+ 2. Normalizing each CFG block's statements (variable names, constants, etc.)
149
+ 3. Creating a canonical representation of the CFG structure
150
+ 4. Hashing the representation with SHA-1
151
+
152
+ Functions with identical control flow and normalized statements will
153
+ produce the same fingerprint, even if they differ in variable names,
154
+ constants, or type annotations.
155
+
156
+ Args:
157
+ node: Function AST node to fingerprint
158
+ cfg: Normalization configuration (what to ignore)
159
+ qualname: Qualified name for logging/debugging
160
+
161
+ Returns:
162
+ 40-character hex SHA-1 hash of the normalized CFG
79
163
  """
80
164
  builder = CFGBuilder()
81
165
  graph = builder.build(qualname, node)
82
166
 
167
+ # Collect one canonical string per block, visiting blocks in id order
83
168
  parts: list[str] = []
84
-
85
- # Stable order for deterministic hash
86
169
  for block in sorted(graph.blocks, key=lambda b: b.id):
87
- # NOTE: normalized_ast_dump_from_list must accept Sequence[ast.AST] (covariant),
88
- # but even if it still accepts list[ast.AST], passing list[ast.stmt] will fail
89
- # due to invariance. We pass as Sequence[ast.AST] via a typed view.
90
- stmts_as_ast: Sequence[ast.AST] = block.statements
91
- normalized_stmts = normalized_ast_dump_from_list(stmts_as_ast, cfg)
92
-
93
- successor_ids = sorted(succ.id for succ in block.successors)
94
-
170
+ succ_ids = ",".join(
171
+ str(s.id) for s in sorted(block.successors, key=lambda s: s.id)
172
+ )
95
173
  parts.append(
96
- f"BLOCK[{block.id}]:{normalized_stmts}"
97
- f"|SUCCESSORS:{','.join(map(str, successor_ids))}"
174
+ f"BLOCK[{block.id}]:{normalized_ast_dump_from_list(block.statements, cfg)}"
175
+ f"|SUCCESSORS:{succ_ids}"
98
176
  )
99
-
100
177
  return sha1("|".join(parts))
101
178
 
102
179
 
@@ -114,9 +191,9 @@ def extract_units_from_source(
114
191
  min_stmt: int,
115
192
  ) -> tuple[list[Unit], list[BlockUnit]]:
116
193
  try:
117
- tree = ast.parse(source)
118
- except SyntaxError:
119
- return [], []
194
+ tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS)
195
+ except SyntaxError as e:
196
+ raise ParseError(f"Failed to parse {filepath}: {e}") from e
120
197
 
121
198
  qb = _QualnameBuilder()
122
199
  qb.visit(tree)