parallax-scan 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. parallax_scan-0.3.0/LICENSE +21 -0
  2. parallax_scan-0.3.0/PKG-INFO +134 -0
  3. parallax_scan-0.3.0/README.md +107 -0
  4. parallax_scan-0.3.0/pyproject.toml +57 -0
  5. parallax_scan-0.3.0/setup.cfg +4 -0
  6. parallax_scan-0.3.0/src/parallax/__init__.py +20 -0
  7. parallax_scan-0.3.0/src/parallax/cli.py +270 -0
  8. parallax_scan-0.3.0/src/parallax/config.py +90 -0
  9. parallax_scan-0.3.0/src/parallax/core.py +220 -0
  10. parallax_scan-0.3.0/src/parallax/extractors/__init__.py +26 -0
  11. parallax_scan-0.3.0/src/parallax/extractors/base.py +23 -0
  12. parallax_scan-0.3.0/src/parallax/extractors/django_models.py +90 -0
  13. parallax_scan-0.3.0/src/parallax/extractors/env_vars.py +80 -0
  14. parallax_scan-0.3.0/src/parallax/extractors/http_urls.py +134 -0
  15. parallax_scan-0.3.0/src/parallax/extractors/redis_keys.py +148 -0
  16. parallax_scan-0.3.0/src/parallax/extractors/sqlalchemy_models.py +188 -0
  17. parallax_scan-0.3.0/src/parallax/reporters/__init__.py +8 -0
  18. parallax_scan-0.3.0/src/parallax/reporters/html_reporter.py +113 -0
  19. parallax_scan-0.3.0/src/parallax/reporters/json_reporter.py +39 -0
  20. parallax_scan-0.3.0/src/parallax/reporters/sarif_reporter.py +112 -0
  21. parallax_scan-0.3.0/src/parallax/reporters/text_reporter.py +53 -0
  22. parallax_scan-0.3.0/src/parallax/verify/__init__.py +25 -0
  23. parallax_scan-0.3.0/src/parallax/verify/core.py +97 -0
  24. parallax_scan-0.3.0/src/parallax/verify/python_ast.py +83 -0
  25. parallax_scan-0.3.0/src/parallax_scan.egg-info/PKG-INFO +134 -0
  26. parallax_scan-0.3.0/src/parallax_scan.egg-info/SOURCES.txt +38 -0
  27. parallax_scan-0.3.0/src/parallax_scan.egg-info/dependency_links.txt +1 -0
  28. parallax_scan-0.3.0/src/parallax_scan.egg-info/entry_points.txt +2 -0
  29. parallax_scan-0.3.0/src/parallax_scan.egg-info/requires.txt +8 -0
  30. parallax_scan-0.3.0/src/parallax_scan.egg-info/top_level.txt +1 -0
  31. parallax_scan-0.3.0/tests/test_config_and_ci.py +120 -0
  32. parallax_scan-0.3.0/tests/test_core.py +158 -0
  33. parallax_scan-0.3.0/tests/test_django_extractor.py +33 -0
  34. parallax_scan-0.3.0/tests/test_env_vars_extractor.py +48 -0
  35. parallax_scan-0.3.0/tests/test_http_urls_extractor.py +48 -0
  36. parallax_scan-0.3.0/tests/test_redis_keys_extractor.py +85 -0
  37. parallax_scan-0.3.0/tests/test_repo_awareness.py +33 -0
  38. parallax_scan-0.3.0/tests/test_reporters.py +90 -0
  39. parallax_scan-0.3.0/tests/test_sqlalchemy_extractor.py +39 -0
  40. parallax_scan-0.3.0/tests/test_verify.py +40 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ziyad Alotaibi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: parallax-scan
3
+ Version: 0.3.0
4
+ Summary: Find functions that work on the same resource through different code paths — the architectural-duplication detector that token-similarity tools miss.
5
+ Author-email: Ziyad Alotaibi <ziyad.alotaibe@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ziyad00/parallax
8
+ Project-URL: Issues, https://github.com/ziyad00/parallax/issues
9
+ Keywords: duplication,static-analysis,ast,refactor,code-quality
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Software Development :: Quality Assurance
17
+ Classifier: Intended Audience :: Developers
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: tomli; python_version < "3.11"
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=7; extra == "dev"
24
+ Requires-Dist: pytest-cov>=4; extra == "dev"
25
+ Requires-Dist: ruff>=0.5; extra == "dev"
26
+ Dynamic: license-file
27
+
28
+ # parallax
29
+
30
+ [![PyPI version](https://img.shields.io/pypi/v/parallax-scan.svg)](https://pypi.org/project/parallax-scan/)
31
+ [![CI](https://github.com/ziyad00/parallax/actions/workflows/ci.yml/badge.svg)](https://github.com/ziyad00/parallax/actions/workflows/ci.yml)
32
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
33
+
34
+ Find code that does the same logical job through different paths.
35
+
36
+ Token-similarity tools (jscpd, PMD CPD, `pylint duplicate-code`) detect copy-paste. parallax detects something different: two pieces of code that touch the same set of resources, regardless of how the code is written. Different filters, different return shapes, even different languages.
37
+
38
+ ## Model
39
+
40
+ A **unit** of code (function, method, file, module, microservice, ...) touches a set of **resources** (database tables, HTTP endpoints, Redis keys, env vars, file paths, ...). Units sharing the same resource set are clustered as duplication candidates.
41
+
42
+ Both unit detection and resource detection are pluggable per **extractor**.
43
+
44
+ ## Built-in extractors
45
+
46
+ | Name | Unit | Resource |
47
+ |---|---|---|
48
+ | `sqlalchemy` | Python function/method | SQLAlchemy ORM model classes |
49
+ | `django` | Python function/method | Django ORM model classes |
50
+ | `http-urls` | any text file | HTTP URL paths |
51
+ | `env-vars` | any text file | Environment variable names |
52
+ | `redis-keys` | any text file | Redis key namespaces |
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install parallax-scan
58
+ ```
59
+
60
+ ## Usage
61
+
62
+ ```bash
63
+ parallax scan path/to/repo
64
+ parallax scan path/to/repo --extractor sqlalchemy
65
+ parallax scan path/to/repo -e sqlalchemy -e http-urls
66
+ parallax scan path/to/repo --min-resources 3 --top 20
67
+ parallax scan path/to/repo --cross-file-only
68
+ parallax scan path/to/repo --format html -o report.html
69
+ parallax scan path/to/repo --format sarif -o parallax.sarif
70
+ ```
71
+
72
+ ## Configuration
73
+
74
+ Drop `.parallax.toml` at your repo root:
75
+
76
+ ```toml
77
+ [scan]
78
+ min_resources = 3
79
+ min_cluster_size = 2
80
+
81
+ [ci]
82
+ max_cluster_size = 5
83
+
84
+ [[ignore]]
85
+ resources = ["User", "Place"]
86
+ reason = "Generic"
87
+ ```
88
+
89
+ In CI:
90
+
91
+ ```bash
92
+ parallax scan . --ci
93
+ parallax scan . --format sarif -o parallax.sarif
94
+ ```
95
+
96
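+ `--ci` exits non-zero only when at least one reported cluster's size reaches or exceeds `ci.max_cluster_size` (if that threshold is not configured, `--ci` always exits zero). Without `--ci`, the exit code is non-zero whenever any cluster is reported.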
97
+
98
+ ## Comparison
99
+
100
+ | Tool | Catches | Doesn't catch |
101
+ |---|---|---|
102
+ | jscpd, PMD CPD, pylint duplicate-code | Token-similar copy-paste | Code with different surface shape |
103
+ | Sourcegraph | Manual code search | Automatic detection |
104
+ | `pydeps`, `dependency-cruiser` | Module-level imports | Same-resource overlap |
105
+ | semgrep | Hand-written patterns | Discovery |
106
+ | parallax | Same-resource overlap regardless of shape | Single-instance bad patterns |
107
+
108
+ ## Status
109
+
110
+ Alpha. API and CLI are unstable until 1.0.
111
+
112
+ ## Writing an extractor
113
+
114
+ ```python
115
+ from pathlib import Path
116
+ from typing import Iterable
117
+
118
+ from parallax import Unit
119
+ from parallax.extractors.base import Extractor
120
+
121
+
122
+ class TerraformAwsExtractor(Extractor):
123
+ name = "terraform-aws"
124
+
125
+ def extract(self, root: Path) -> Iterable[Unit]:
126
+ for tf in root.rglob("*.tf"):
127
+ ...
128
+ ```
129
+
130
+ Register in `parallax.extractors.BUILTIN_EXTRACTORS`.
131
+
132
+ ## License
133
+
134
+ MIT
@@ -0,0 +1,107 @@
1
+ # parallax
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/parallax-scan.svg)](https://pypi.org/project/parallax-scan/)
4
+ [![CI](https://github.com/ziyad00/parallax/actions/workflows/ci.yml/badge.svg)](https://github.com/ziyad00/parallax/actions/workflows/ci.yml)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
6
+
7
+ Find code that does the same logical job through different paths.
8
+
9
+ Token-similarity tools (jscpd, PMD CPD, `pylint duplicate-code`) detect copy-paste. parallax detects something different: two pieces of code that touch the same set of resources, regardless of how the code is written. Different filters, different return shapes, even different languages.
10
+
11
+ ## Model
12
+
13
+ A **unit** of code (function, method, file, module, microservice, ...) touches a set of **resources** (database tables, HTTP endpoints, Redis keys, env vars, file paths, ...). Units sharing the same resource set are clustered as duplication candidates.
14
+
15
+ Both unit detection and resource detection are pluggable per **extractor**.
16
+
17
+ ## Built-in extractors
18
+
19
+ | Name | Unit | Resource |
20
+ |---|---|---|
21
+ | `sqlalchemy` | Python function/method | SQLAlchemy ORM model classes |
22
+ | `django` | Python function/method | Django ORM model classes |
23
+ | `http-urls` | any text file | HTTP URL paths |
24
+ | `env-vars` | any text file | Environment variable names |
25
+ | `redis-keys` | any text file | Redis key namespaces |
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install parallax-scan
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ```bash
36
+ parallax scan path/to/repo
37
+ parallax scan path/to/repo --extractor sqlalchemy
38
+ parallax scan path/to/repo -e sqlalchemy -e http-urls
39
+ parallax scan path/to/repo --min-resources 3 --top 20
40
+ parallax scan path/to/repo --cross-file-only
41
+ parallax scan path/to/repo --format html -o report.html
42
+ parallax scan path/to/repo --format sarif -o parallax.sarif
43
+ ```
44
+
45
+ ## Configuration
46
+
47
+ Drop `.parallax.toml` at your repo root:
48
+
49
+ ```toml
50
+ [scan]
51
+ min_resources = 3
52
+ min_cluster_size = 2
53
+
54
+ [ci]
55
+ max_cluster_size = 5
56
+
57
+ [[ignore]]
58
+ resources = ["User", "Place"]
59
+ reason = "Generic"
60
+ ```
61
+
62
+ In CI:
63
+
64
+ ```bash
65
+ parallax scan . --ci
66
+ parallax scan . --format sarif -o parallax.sarif
67
+ ```
68
+
69
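+ `--ci` exits non-zero only when at least one reported cluster's size reaches or exceeds `ci.max_cluster_size` (if that threshold is not configured, `--ci` always exits zero). Without `--ci`, the exit code is non-zero whenever any cluster is reported.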
70
+
71
+ ## Comparison
72
+
73
+ | Tool | Catches | Doesn't catch |
74
+ |---|---|---|
75
+ | jscpd, PMD CPD, pylint duplicate-code | Token-similar copy-paste | Code with different surface shape |
76
+ | Sourcegraph | Manual code search | Automatic detection |
77
+ | `pydeps`, `dependency-cruiser` | Module-level imports | Same-resource overlap |
78
+ | semgrep | Hand-written patterns | Discovery |
79
+ | parallax | Same-resource overlap regardless of shape | Single-instance bad patterns |
80
+
81
+ ## Status
82
+
83
+ Alpha. API and CLI are unstable until 1.0.
84
+
85
+ ## Writing an extractor
86
+
87
+ ```python
88
+ from pathlib import Path
89
+ from typing import Iterable
90
+
91
+ from parallax import Unit
92
+ from parallax.extractors.base import Extractor
93
+
94
+
95
+ class TerraformAwsExtractor(Extractor):
96
+ name = "terraform-aws"
97
+
98
+ def extract(self, root: Path) -> Iterable[Unit]:
99
+ for tf in root.rglob("*.tf"):
100
+ ...
101
+ ```
102
+
103
+ Register in `parallax.extractors.BUILTIN_EXTRACTORS`.
104
+
105
+ ## License
106
+
107
+ MIT
@@ -0,0 +1,57 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "parallax-scan"
7
+ version = "0.3.0"
8
+ description = "Find functions that work on the same resource through different code paths — the architectural-duplication detector that token-similarity tools miss."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Ziyad Alotaibi", email = "ziyad.alotaibe@gmail.com" }
14
+ ]
15
+ keywords = ["duplication", "static-analysis", "ast", "refactor", "code-quality"]
16
+ dependencies = [
17
+ "tomli; python_version < '3.11'",
18
+ ]
19
+ classifiers = [
20
+ "Development Status :: 3 - Alpha",
21
+ "License :: OSI Approved :: MIT License",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Software Development :: Quality Assurance",
27
+ "Intended Audience :: Developers",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ dev = [
32
+ "pytest>=7",
33
+ "pytest-cov>=4",
34
+ "ruff>=0.5",
35
+ ]
36
+
37
+ [project.scripts]
38
+ parallax = "parallax.cli:main"
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/ziyad00/parallax"
42
+ Issues = "https://github.com/ziyad00/parallax/issues"
43
+
44
+ [tool.setuptools.packages.find]
45
+ where = ["src"]
46
+
47
+ [tool.ruff]
48
+ line-length = 100
49
+ target-version = "py310"
50
+
51
+ [tool.ruff.lint]
52
+ select = ["E", "F", "W", "I", "B", "UP"]
53
+ ignore = ["E501"]
54
+
55
+ [tool.pytest.ini_options]
56
+ testpaths = ["tests"]
57
+ addopts = "-ra"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,20 @@
1
+ """parallax: find code that does the same logical job through different paths."""
2
+
3
+ from .core import (
4
+ Cluster,
5
+ FoldedGroup,
6
+ Unit,
7
+ fold_units_by_class,
8
+ group_by_resource_set,
9
+ )
10
+ from .extractors.base import Extractor
11
+
12
+ __all__ = [
13
+ "Cluster",
14
+ "Extractor",
15
+ "FoldedGroup",
16
+ "Unit",
17
+ "fold_units_by_class",
18
+ "group_by_resource_set",
19
+ ]
20
+ __version__ = "0.3.0"
@@ -0,0 +1,270 @@
1
+ """parallax CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from .config import Config, load_config
11
+ from .core import Cluster, Unit, group_by_resource_set
12
+ from .extractors import BUILTIN_EXTRACTORS
13
+ from .reporters import render_html, render_json, render_sarif, render_text
14
+ from .verify import verify_cluster
15
+
16
+
17
+ REPORTERS = {"text", "json", "html", "sarif"}
18
+
19
+
20
def _build_argparser() -> argparse.ArgumentParser:
    """Build the ``parallax`` command-line parser.

    Registers the ``scan``, ``list-extractors`` and ``verify`` sub-commands;
    one of them must be named on the command line.
    """
    parser = argparse.ArgumentParser(
        prog="parallax",
        description="Find code that does the same logical job through different paths.",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # scan: positional root followed by filtering and output options.
    scan_cmd = commands.add_parser(
        "scan", help="Scan a tree for clusters of overlapping units."
    )
    scan_opt = scan_cmd.add_argument
    scan_opt("path", type=Path, help="Root directory to scan.")
    scan_opt(
        "--extractor",
        "-e",
        action="append",
        choices=sorted(BUILTIN_EXTRACTORS),
        help="Which extractor to run. Repeat to combine. Default: all.",
    )
    config_help = (
        "Path to a config file (TOML). If omitted, parallax looks for "
        ".parallax.toml in the working directory."
    )
    scan_opt("--config", "-c", type=Path, help=config_help)
    scan_opt("--min-resources", type=int, help="Override config.scan.min_resources.")
    scan_opt(
        "--min-cluster-size",
        type=int,
        help="Override config.scan.min_cluster_size.",
    )
    scan_opt(
        "--format",
        "-f",
        choices=sorted(REPORTERS),
        default="text",
        help="Output format. Default 'text'.",
    )
    scan_opt(
        "--output",
        "-o",
        type=Path,
        help="Write output to FILE instead of stdout.",
    )
    scan_opt("--json", action="store_true", help="Shortcut for --format json.")
    scan_opt(
        "--cross-file-only",
        action="store_true",
        help="Drop clusters whose units all live in one file.",
    )
    scan_opt(
        "--top",
        type=int,
        default=None,
        help="Limit the report to the N highest-scoring clusters.",
    )
    scan_opt(
        "--fold-threshold",
        type=int,
        default=5,
        help="Collapse N+ methods of the same class into one row in text output. Set 0 to disable.",
    )
    scan_opt(
        "--ci",
        action="store_true",
        help="Exit non-zero only when a cluster meets ci.max_cluster_size from config.",
    )

    # list-extractors: takes no arguments of its own.
    commands.add_parser(
        "list-extractors", help="Print the extractors built into parallax."
    )

    # verify: cluster JSON comes from stdin unless --file is given.
    verify_cmd = commands.add_parser(
        "verify",
        help="Read a cluster (JSON on stdin or --file) and run deeper analysis.",
    )
    verify_cmd.add_argument(
        "--root",
        type=Path,
        default=Path("."),
        help="Source root used to resolve unit locations. Default: cwd.",
    )
    verify_cmd.add_argument(
        "--file",
        type=Path,
        help="Read the cluster JSON from FILE instead of stdin.",
    )

    return parser
115
+
116
+
117
def _apply_overrides(args: argparse.Namespace, cfg: Config) -> Config:
    """Copy explicitly supplied CLI thresholds onto ``cfg`` and return it.

    Only ``min_resources`` and ``min_cluster_size`` are considered. A value of
    ``None`` on ``args`` leaves the corresponding config value untouched.
    ``cfg`` is modified in place and the same object is returned.
    """
    for field in ("min_resources", "min_cluster_size"):
        override = getattr(args, field)
        if override is not None:
            setattr(cfg, field, override)
    return cfg
124
+
125
+
126
def _filter_ignored(
    clusters: list[Cluster], cfg: Config
) -> tuple[list[Cluster], list[Cluster]]:
    """Partition ``clusters`` into ``(kept, ignored)``, preserving input order.

    A cluster is ignored when ``cfg.matches_ignore`` returns anything other
    than ``None`` for its ``resources``.
    """
    kept: list[Cluster] = []
    ignored: list[Cluster] = []
    for cluster in clusters:
        suppressed = cfg.matches_ignore(cluster.resources) is not None
        bucket = ignored if suppressed else kept
        bucket.append(cluster)
    return kept, ignored
138
+
139
+
140
+ def cmd_scan(args: argparse.Namespace) -> int:
141
+ if not args.path.exists():
142
+ print(f"error: {args.path} does not exist", file=sys.stderr)
143
+ return 2
144
+
145
+ cfg = load_config(args.config)
146
+ cfg = _apply_overrides(args, cfg)
147
+
148
+ extractor_names: list[str] = args.extractor or sorted(BUILTIN_EXTRACTORS)
149
+ units: list[Unit] = []
150
+ for name in extractor_names:
151
+ cls = BUILTIN_EXTRACTORS[name]
152
+ units.extend(cls().extract(args.path))
153
+
154
+ clusters = group_by_resource_set(
155
+ units,
156
+ min_resources=cfg.min_resources,
157
+ min_cluster_size=cfg.min_cluster_size,
158
+ cross_file_only=args.cross_file_only,
159
+ )
160
+ kept, ignored = _filter_ignored(clusters, cfg)
161
+ if args.top is not None and args.top >= 0:
162
+ kept = kept[: args.top]
163
+
164
+ fmt = "json" if args.json else args.format
165
+ if fmt == "text":
166
+ output = render_text(
167
+ kept,
168
+ scanned_units=len(units),
169
+ extractors=extractor_names,
170
+ min_resources=cfg.min_resources,
171
+ min_cluster_size=cfg.min_cluster_size,
172
+ fold_threshold=max(0, args.fold_threshold),
173
+ )
174
+ if ignored:
175
+ output += f"\n({len(ignored)} cluster(s) suppressed by ignore rules)\n"
176
+ elif fmt == "json":
177
+ output = render_json(kept, scanned_units=len(units))
178
+ elif fmt == "html":
179
+ output = render_html(
180
+ kept,
181
+ scanned_units=len(units),
182
+ extractors=extractor_names,
183
+ min_resources=cfg.min_resources,
184
+ min_cluster_size=cfg.min_cluster_size,
185
+ )
186
+ elif fmt == "sarif":
187
+ output = render_sarif(kept, scanned_units=len(units), extractors=extractor_names)
188
+ else: # pragma: no cover
189
+ raise ValueError(f"unknown format: {fmt}")
190
+
191
+ if args.output:
192
+ args.output.write_text(output, encoding="utf-8")
193
+ else:
194
+ sys.stdout.write(output)
195
+ if not output.endswith("\n"):
196
+ sys.stdout.write("\n")
197
+
198
+ return _exit_code(kept, cfg, args.ci)
199
+
200
+
201
def _exit_code(kept: list[Cluster], cfg: Config, ci_mode: bool) -> int:
    """Translate the reported clusters into a process exit code.

    Outside CI mode the code is 1 whenever ``kept`` is non-empty, else 0.
    In CI mode the code is 1 only if some cluster's ``size`` is at least
    ``cfg.max_cluster_size``; with no threshold configured it is always 0.
    """
    if not ci_mode:
        return 1 if kept else 0

    limit = cfg.max_cluster_size
    if limit is None:
        return 0
    for cluster in kept:
        if cluster.size >= limit:
            return 1
    return 0
209
+
210
+
211
def cmd_verify(args: argparse.Namespace) -> int:
    """Run the ``verify`` sub-command and return its exit code.

    The cluster JSON is read from ``args.file`` when given, otherwise from
    stdin. Two payload shapes are accepted: a single cluster dict (it has a
    ``"units"`` key), or a scan payload whose ``"clusters"`` list is non-empty,
    in which case only the first cluster is verified.

    Returns:
        0 after printing the verifier results (or a note that no verifier
        applied); 2 when the input cannot be read, is not valid JSON, or does
        not have one of the accepted shapes.
    """
    source = args.file if args.file else "<stdin>"
    try:
        raw = args.file.read_text(encoding="utf-8") if args.file else sys.stdin.read()
    except (OSError, UnicodeDecodeError) as e:
        # Missing/unreadable/non-UTF-8 input is a usage error, not a crash.
        print(f"error: cannot read {source}: {e}", file=sys.stderr)
        return 2

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"error: invalid JSON: {e}", file=sys.stderr)
        return 2

    cluster: dict | None = None
    if isinstance(payload, dict):
        if "units" in payload:
            cluster = payload
        else:
            # Only index into "clusters" when it really is a non-empty list of
            # dicts; any other shape falls through to the error below.
            candidates = payload.get("clusters")
            if isinstance(candidates, list) and candidates and isinstance(candidates[0], dict):
                cluster = candidates[0]

    if cluster is None:
        print(
            "error: expected a cluster dict (with 'units') or a scan payload "
            "(with 'clusters')",
            file=sys.stderr,
        )
        return 2

    results = verify_cluster(cluster, root=args.root)
    if not results:
        print("No applicable verifiers for this cluster.")
        return 0

    for result in results:
        print(
            f"verifier={result.verifier} "
            f"recommendation={result.recommendation.value} "
            f"mean_similarity={result.mean_similarity:.2f}"
        )
        for pair in result.pairs:
            print(f"  {pair.score:.2f}  {pair.a_location}  ~  {pair.b_location}")
    return 0
246
+
247
+
248
def cmd_list_extractors(_: argparse.Namespace) -> int:
    """Print each built-in extractor name with the first line of its docstring.

    Names are listed in sorted order, left-aligned in a 14-character column.
    An extractor without a docstring gets an empty summary. Always returns 0.
    """
    for name in sorted(BUILTIN_EXTRACTORS):
        doc = (BUILTIN_EXTRACTORS[name].__doc__ or "").strip()
        summary = next(iter(doc.splitlines()), "")
        print(f"{name:<14} {summary}")
    return 0
254
+
255
+
256
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and dispatch to the matching sub-command handler.

    Returns the handler's exit code. ``argv`` of ``None`` is passed straight to
    ``parse_args``.
    """
    parser = _build_argparser()
    args = parser.parse_args(argv)

    handlers = {
        "scan": cmd_scan,
        "list-extractors": cmd_list_extractors,
        "verify": cmd_verify,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        return handler(args)

    parser.error("unknown command")
    return 2
267
+
268
+
269
+ if __name__ == "__main__":
270
+ raise SystemExit(main())