skilltotal 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skilltotal/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ """SkillTotal — AI Component Security Platform (core engine).
2
+
3
+ This package is the reusable core engine. Everything here is import-safe and free of
4
+ process-level side effects (no printing, no ``sys.exit``) *except* :mod:`skilltotal.cli`,
5
+ which is the thin I/O shell. Future web and enterprise products are intended to import
6
+ :func:`skilltotal.engine.analyze` directly.
7
+ """
8
+
9
+ from skilltotal.models import (
10
+ Capability,
11
+ Component,
12
+ Evidence,
13
+ Finding,
14
+ NeedsReview,
15
+ Report,
16
+ RiskLevel,
17
+ Severity,
18
+ )
19
+
20
+ # --- Versioned contract (consumed by downstream products such as the web app) -----------
21
+ # ENGINE_VERSION: semver of the code / public API; pin this from a consumer.
22
+ # REPORT_SCHEMA_VERSION: shape of Report.to_dict(); bumps only on schema changes.
23
+ # RULESET_VERSION: integer counter of the detection ruleset; bumps when rules change, so a
24
+ # consumer knows when re-scanning old reports may surface new findings.
25
+ __version__ = "0.3.0"
26
+ ENGINE_VERSION = __version__
27
+ REPORT_SCHEMA_VERSION = "1.1"
28
+ RULESET_VERSION = 4
29
+
30
+ __all__ = [
31
+ "__version__",
32
+ "ENGINE_VERSION",
33
+ "REPORT_SCHEMA_VERSION",
34
+ "RULESET_VERSION",
35
+ "Capability",
36
+ "Component",
37
+ "Evidence",
38
+ "Finding",
39
+ "NeedsReview",
40
+ "Report",
41
+ "RiskLevel",
42
+ "Severity",
43
+ ]
skilltotal/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Enable ``python -m skilltotal`` as an entry point."""
2
+
3
+ from skilltotal.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ raise SystemExit(main())
skilltotal/baseline.py ADDED
@@ -0,0 +1,81 @@
1
+ """Baseline suppression of known findings.
2
+
3
+ A baseline records stable *fingerprints* of accepted findings so they no longer appear in
4
+ future scans (useful for adopting SkillTotal on an existing repo, or for CI gates). A
5
+ fingerprint hashes ``(rule_id, file, normalized snippet)`` — deliberately **not** the line
6
+ number — so it survives edits that shift lines.
7
+
8
+ Suppression is applied at the evidence level before scoring: matched evidence is removed,
9
+ and a finding with no remaining evidence is dropped entirely (preserving the
10
+ "no finding without evidence" invariant) and does not contribute to the score.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import hashlib
16
+ import json
17
+ from pathlib import Path
18
+
19
+ from skilltotal.models import Evidence, Finding
20
+
21
+
22
def fingerprint(rule_id: str, evidence: Evidence) -> str:
    """Return a short identifier for one evidence occurrence that ignores line numbers.

    The value is the first 16 hex characters of the SHA-1 digest of
    ``rule_id|file|snippet`` (snippet stripped of surrounding whitespace).
    """
    fields = (rule_id, evidence.file, evidence.snippet.strip())
    raw = "|".join(f"{field}" for field in fields)
    # SHA-1 is used purely as a stable fingerprint here, never for security.
    hasher = hashlib.sha1(usedforsecurity=False)
    hasher.update(raw.encode("utf-8"))
    return hasher.hexdigest()[:16]
28
+
29
+
30
def finding_fingerprints(finding: Finding) -> list[str]:
    """Return one fingerprint per evidence item of ``finding``, in evidence order."""
    prints: list[str] = []
    for item in finding.evidence:
        prints.append(fingerprint(finding.id, item))
    return prints
32
+
33
+
34
def apply_suppressions(
    findings: list[Finding], suppressed: set[str]
) -> tuple[list[Finding], int]:
    """Remove suppressed evidence and return ``(kept_findings, suppressed_count)``.

    Evidence whose fingerprint is in ``suppressed`` is dropped. A finding that still has
    evidence is rebuilt with only the surviving evidence; a finding left with none is
    omitted. When ``suppressed`` is empty the input list is returned unchanged.
    """
    if not suppressed:
        return findings, 0

    survivors: list[Finding] = []
    dropped = 0
    for item in findings:
        live = []
        for ev in item.evidence:
            if fingerprint(item.id, ev) in suppressed:
                dropped += 1
            else:
                live.append(ev)
        if not live:
            # Every occurrence was suppressed: the finding disappears entirely.
            continue
        rebuilt = Finding(
            id=item.id,
            severity=item.severity,
            category=item.category,
            title=item.title,
            description=item.description,
            evidence=live,
            recommendation=item.recommendation,
        )
        survivors.append(rebuilt)
    return survivors, dropped
58
+
59
+
60
+ def load_baseline(path: str | Path) -> set[str]:
61
+ """Load a baseline file into a set of fingerprints.
62
+
63
+ Accepts either a JSON object ``{"suppressed": [...]}`` or a plain JSON list.
64
+ """
65
+ data = json.loads(Path(path).read_text(encoding="utf-8"))
66
+ if isinstance(data, dict):
67
+ items = data.get("suppressed", [])
68
+ elif isinstance(data, list):
69
+ items = data
70
+ else:
71
+ items = []
72
+ return {str(x) for x in items}
73
+
74
+
75
def build_baseline(findings: list[Finding]) -> dict[str, object]:
    """Return a baseline document listing the sorted, de-duplicated fingerprints of ``findings``."""
    unique: set[str] = set()
    for finding in findings:
        unique.update(finding_fingerprints(finding))
    return {"version": 1, "suppressed": sorted(unique)}
@@ -0,0 +1,31 @@
1
+ """Evidence-based capability extraction.
2
+
3
+ Capabilities are a pure projection over findings: each finding's rule declares the
4
+ :class:`~skilltotal.models.Capability` it implies, so we simply regroup the evidence the
5
+ findings already proved. No file is re-scanned, and every capability is therefore
6
+ evidence-backed by construction.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from skilltotal.models import Capability, Evidence, Finding
12
+ from skilltotal.scanners import rule_by_id
13
+
14
+ # Evidence kept per capability (capabilities can aggregate many findings).
15
+ MAX_EVIDENCE_PER_CAPABILITY = 25
16
+
17
+
18
def extract_capabilities(findings: list[Finding]) -> dict[Capability, list[Evidence]]:
    """Group finding evidence by the capability declared on each finding's rule.

    Findings whose rule is unknown, or whose rule declares no capability, are ignored.
    Each capability keeps at most ``MAX_EVIDENCE_PER_CAPABILITY`` evidence items.
    """
    known_rules = rule_by_id()
    grouped: dict[Capability, list[Evidence]] = {}
    for item in findings:
        matched = known_rules.get(item.id)
        if not matched:
            continue
        cap = matched.capability
        if cap is None:
            continue
        collected = grouped.setdefault(cap, [])
        for ev in item.evidence:
            if len(collected) < MAX_EVIDENCE_PER_CAPABILITY:
                collected.append(ev)
            else:
                # Bucket is full; the rest of this finding's evidence is not recorded.
                break
    return grouped
skilltotal/cli.py ADDED
@@ -0,0 +1,155 @@
1
+ """SkillTotal command-line interface — the only I/O shell around the core engine.
2
+
3
+ Commands:
4
+ skilltotal scan <path-or-url> [--json] [--sarif] [--output FILE] [--baseline FILE] [--write-baseline FILE] [--fail-on-high]
5
+ skilltotal rules list [--json]
6
+
7
+ Exit codes:
8
+ 0 success
9
+ 1 usage / collection error
10
+ 2 --fail-on-high set and a finding of severity >= high was produced
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import sys
18
+ from pathlib import Path
19
+
20
+ from skilltotal import __version__
21
+ from skilltotal.baseline import build_baseline, load_baseline
22
+ from skilltotal.collector import CollectionError
23
+ from skilltotal.engine import analyze
24
+ from skilltotal.models import Severity
25
+ from skilltotal.report import (
26
+ render_json,
27
+ render_rules_json,
28
+ render_rules_text,
29
+ render_text,
30
+ )
31
+ from skilltotal.rules import get_rules
32
+ from skilltotal.sarif import render_sarif
33
+
34
+ EXIT_OK = 0
35
+ EXIT_ERROR = 1
36
+ EXIT_FAIL_ON_HIGH = 2
37
+
38
+
39
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``skilltotal`` argument parser with its ``scan`` and ``rules`` commands."""
    root = argparse.ArgumentParser(
        prog="skilltotal",
        description="AI Component Security Platform — static analysis of AI components.",
    )
    root.add_argument("--version", action="version", version=f"skilltotal {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # scan: options are registered in the order they should appear in --help.
    scan_cmd = commands.add_parser("scan", help="Scan a component (local path or git URL).")
    scan_cmd.add_argument("source", help="Local directory path or git repository URL.")
    scan_cmd.add_argument("--json", action="store_true", help="Emit JSON to stdout.")
    scan_cmd.add_argument(
        "--sarif",
        action="store_true",
        help="Emit SARIF 2.1.0 to stdout (and to --output if given).",
    )
    file_options = (
        ("--output", "Write the report to FILE (SARIF if --sarif, else JSON)."),
        ("--baseline", "Suppress findings whose fingerprints are listed in this baseline file."),
        (
            "--write-baseline",
            "Write a baseline file covering the current findings, then exit normally.",
        ),
    )
    for flag, help_text in file_options:
        scan_cmd.add_argument(flag, metavar="FILE", help=help_text)
    scan_cmd.add_argument(
        "--fail-on-high",
        action="store_true",
        help="Exit with code 2 if any finding is high or critical.",
    )

    # rules: currently a single "list" sub-command.
    rules_cmd = commands.add_parser("rules", help="Inspect the detection rules.")
    rules_commands = rules_cmd.add_subparsers(dest="rules_command", required=True)
    list_cmd = rules_commands.add_parser("list", help="List all detection rules.")
    list_cmd.add_argument("--json", action="store_true", help="Emit JSON to stdout.")

    return root
82
+
83
+
84
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and dispatch to the matching command handler; return its exit code."""
    parser = build_parser()
    args = parser.parse_args(argv)

    handlers = {"scan": _cmd_scan, "rules": _cmd_rules}
    handler = handlers.get(args.command)
    if handler is not None:
        return handler(args)
    parser.error("unknown command")  # pragma: no cover
    return EXIT_ERROR
94
+
95
+
96
def _cmd_scan(args: argparse.Namespace) -> int:
    """Run the ``scan`` command and return the process exit code.

    Steps: load the optional baseline, analyze the source, optionally write a new
    baseline, print the report (SARIF, JSON or text), optionally write the structured
    report to ``--output``, then apply ``--fail-on-high``.

    Returns:
        ``EXIT_ERROR`` when the baseline cannot be read, collection fails, or an output
        file cannot be written; ``EXIT_FAIL_ON_HIGH`` when ``--fail-on-high`` is set and
        a finding of severity high or above exists; otherwise ``EXIT_OK``.
    """
    suppress: set[str] = set()
    if args.baseline:
        try:
            suppress = load_baseline(args.baseline)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError (non-UTF-8 file).
            print(f"error: cannot read baseline {args.baseline}: {exc}", file=sys.stderr)
            return EXIT_ERROR

    try:
        report = analyze(args.source, suppress=suppress)
    except CollectionError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return EXIT_ERROR

    if args.write_baseline:
        doc = build_baseline(report.findings)
        try:
            Path(args.write_baseline).write_text(
                json.dumps(doc, indent=2), encoding="utf-8"
            )
        except OSError as exc:
            # Report an unwritable path as a clean error instead of a traceback.
            print(
                f"error: cannot write baseline {args.write_baseline}: {exc}",
                file=sys.stderr,
            )
            return EXIT_ERROR
        print(
            f"Baseline with {len(doc['suppressed'])} fingerprint(s) written to "
            f"{args.write_baseline}",
            file=sys.stderr,
        )

    # Choose the structured renderer once; reuse for stdout and --output.
    structured = render_sarif(report) if args.sarif else render_json(report)

    if args.sarif or args.json:
        print(structured)
    else:
        print(render_text(report))

    if args.output:
        try:
            Path(args.output).write_text(structured, encoding="utf-8")
        except OSError as exc:
            print(f"error: cannot write report {args.output}: {exc}", file=sys.stderr)
            return EXIT_ERROR
        print(f"Report written to {args.output}", file=sys.stderr)

    if args.fail_on_high and _has_high(report):
        return EXIT_FAIL_ON_HIGH
    return EXIT_OK
140
+
141
+
142
def _cmd_rules(args: argparse.Namespace) -> int:
    """Handle ``rules <sub-command>``; only ``list`` is implemented."""
    if args.rules_command != "list":
        return EXIT_ERROR
    rules = get_rules()
    renderer = render_rules_json if args.json else render_rules_text
    print(renderer(rules))
    return EXIT_OK
151
+
152
+
153
def _has_high(report) -> bool:
    """Return True if any finding in ``report`` ranks at or above ``Severity.HIGH``."""
    floor = Severity.HIGH.rank
    for finding in report.findings:
        if finding.severity.rank >= floor:
            return True
    return False
@@ -0,0 +1,371 @@
1
+ """Source collection: resolve a path or URL into a local directory + component identity.
2
+
3
+ Supported sources: a local directory, a remote git URL (cloned shallowly into a temp
4
+ directory), or an npm / PyPI package (downloaded from the registry and extracted into a temp directory). Component identity (name/type/version) is derived **only** from the component
5
+ itself — never from the user's environment.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ import json
12
+ import re
13
+ import shutil
14
+ import stat
15
+ import subprocess # nosec B404
16
+ import tarfile
17
+ import tempfile
18
+ import urllib.request
19
+ import zipfile
20
+ from dataclasses import dataclass, replace
21
+ from pathlib import Path
22
+ from urllib.parse import quote
23
+
24
+ try: # Python 3.11+
25
+ import tomllib
26
+ except ModuleNotFoundError: # pragma: no cover
27
+ tomllib = None # type: ignore[assignment]
28
+
29
+ from skilltotal.models import Component
30
+
31
+ _GIT_URL_RE = re.compile(r"^(?:https?://|git@|ssh://|git://).+", re.IGNORECASE)
32
+ _NPMJS_URL_RE = re.compile(r"^https?://(?:www\.)?npmjs\.com/package/(@?[\w.-]+(?:/[\w.-]+)?)", re.I)
33
+ _PYPI_URL_RE = re.compile(r"^https?://pypi\.org/project/([\w.-]+)", re.I)
34
+ # Conservative package-name shapes (also block path traversal in specs).
35
+ _NPM_NAME_RE = re.compile(r"^@?[a-z0-9][\w.-]*(?:/[a-z0-9][\w.-]*)?$", re.I)
36
+ _PYPI_NAME_RE = re.compile(r"^[a-z0-9][\w.-]*$", re.I)
37
+
38
+ _HTTP_TIMEOUT = 60 # seconds for a registry/download request
39
+ _MAX_ARCHIVE_BYTES = 150 * 1024 * 1024 # cap the downloaded archive
40
+ _MAX_EXTRACT_BYTES = 400 * 1024 * 1024 # cap total uncompressed size (decompression-bomb guard)
41
+
42
+
43
class CollectionError(Exception):
    """Signals that a source could not be turned into a local directory ready for analysis."""
45
+
46
+
47
@dataclass
class SourceContext:
    """A component resolved to a local directory, usable as a context manager.

    Leaving the ``with`` block calls :meth:`cleanup`, which removes the backing
    temporary directory when one was supplied.
    """

    # Directory on local disk that holds the component's files.
    root: Path
    # Identity of the component found under ``root``.
    component: Component
    # Temporary directory backing ``root``, when one was created for this source.
    _tempdir: tempfile.TemporaryDirectory | None = None

    def __enter__(self) -> SourceContext:
        return self

    def __exit__(self, *exc: object) -> None:
        self.cleanup()

    def cleanup(self) -> None:
        """Delete the backing temporary directory, if any; later calls do nothing."""
        held = self._tempdir
        if held is None:
            return
        held.cleanup()
        self._tempdir = None
65
+
66
+
67
def is_url(source: str) -> bool:
    """Return True when ``source``, ignoring surrounding whitespace, matches ``_GIT_URL_RE``."""
    candidate = source.strip()
    return _GIT_URL_RE.match(candidate) is not None
69
+
70
+
71
def classify_source(source: str) -> str:
    """Return ``'npm'``, ``'pypi'``, ``'git'`` or ``'local'`` for a source string.

    Registry specs and registry URLs are checked before the generic URL test, so an
    npmjs.com or pypi.org link is never classified as a git URL.
    """
    text = source.strip()
    lowered = text.lower()
    if lowered.startswith("npm:") or _NPMJS_URL_RE.match(text):
        return "npm"
    if lowered.startswith("pypi:") or _PYPI_URL_RE.match(text):
        return "pypi"
    return "git" if is_url(text) else "local"
81
+
82
+
83
def npm_package_name(source: str) -> str | None:
    """Extract the npm package name from an `npm:<name>` spec or an npmjs.com URL.

    For URLs, only scoped packages (``@scope/name``) keep a second path segment. An
    unscoped URL such as ``.../package/lodash/v/4.17.21`` resolves to ``lodash``, not
    ``lodash/v``.

    Returns:
        The package name, or ``None`` when no name is present or it does not match
        ``_NPM_NAME_RE``.
    """
    s = source.strip()
    if s.lower().startswith("npm:"):
        name = s[4:].strip()
    else:
        m = _NPMJS_URL_RE.match(s)
        name = m.group(1) if m else ""
        if not name.startswith("@"):
            # The URL pattern may capture one extra path segment (e.g. "/v"); an
            # unscoped package name is always the first segment only.
            name = name.partition("/")[0]
    return name if name and _NPM_NAME_RE.match(name) else None
92
+
93
+
94
def pypi_package_name(source: str) -> str | None:
    """Return the PyPI project name from a `pypi:<name>` spec or a pypi.org URL.

    ``None`` is returned when no name can be extracted or it does not match
    ``_PYPI_NAME_RE``.
    """
    text = source.strip()
    if text.lower().startswith("pypi:"):
        candidate = text[5:].strip()
    else:
        found = _PYPI_URL_RE.match(text)
        candidate = found.group(1) if found else ""
    if candidate and _PYPI_NAME_RE.match(candidate):
        return candidate
    return None
103
+
104
+
105
def collect(source: str) -> SourceContext:
    """Resolve ``source`` into a :class:`SourceContext`.

    The source is classified first and then handed to the matching collector: npm
    package, PyPI package, git URL, or (the fallback) a local directory.
    """
    collectors = {
        "npm": _collect_npm,
        "pypi": _collect_pypi,
        "git": _collect_remote,
    }
    handler = collectors.get(classify_source(source), _collect_local)
    return handler(source)
120
+
121
+
122
def _collect_local(source: str) -> SourceContext:
    """Build a context for a local directory.

    Raises:
        CollectionError: the path does not exist or is not a directory.
    """
    resolved = Path(source).expanduser().resolve()
    if not resolved.exists():
        raise CollectionError(f"Path does not exist: {source}")
    if not resolved.is_dir():
        raise CollectionError(f"Path is not a directory: {source}")
    return SourceContext(
        root=resolved,
        component=detect_component(resolved, source=str(resolved)),
    )
130
+
131
+
132
def _collect_remote(url: str) -> SourceContext:
    """Shallow-clone a remote git URL into a temp dir.

    Security: the subprocess call is intentional and reviewed — git is resolved from PATH
    (cross-platform), arguments are passed as a list (never with a shell), and the URL has
    already been validated by :func:`is_url`. A ``--`` separator precedes the URL so it can
    never be parsed as a git option. The call below is annotated as a reviewed exception
    for the static security scan.

    The temporary directory is removed on every failure path, not only when the clone
    itself fails.

    Raises:
        CollectionError: git is missing from PATH, or the clone fails.
    """
    if shutil.which("git") is None:
        raise CollectionError(
            "git is required to analyze remote URLs but was not found on PATH."
        )
    # is_url() validates the stripped form, so clone the stripped form too.
    url = url.strip()
    tmp = tempfile.TemporaryDirectory(prefix="skilltotal_")
    dest = Path(tmp.name) / "repo"
    try:
        subprocess.run(  # nosec B603 B607
            ["git", "clone", "--depth", "1", "--", url, str(dest)],
            check=True,
            capture_output=True,
            text=True,
        )
        component = detect_component(dest, source=url)
    except subprocess.CalledProcessError as exc:
        tmp.cleanup()
        raise CollectionError(
            f"git clone failed for {url}: {exc.stderr.strip() or exc}"
        ) from exc
    except OSError as exc:
        # git disappeared or is not executable between the PATH check and the call.
        tmp.cleanup()
        raise CollectionError(f"git clone failed for {url}: {exc}") from exc
    except Exception:
        # Any other failure (e.g. while inspecting the clone) must not leak the temp dir.
        tmp.cleanup()
        raise
    return SourceContext(root=dest, component=component, _tempdir=tmp)
160
+
161
+
162
+ # --------------------------------------------------------------- package registries
163
+
164
def _open(url: str):
    """Open ``url`` with the module timeout, refusing anything that is not https."""
    is_https = url.lower().startswith("https://")
    if is_https:
        return urllib.request.urlopen(url, timeout=_HTTP_TIMEOUT)  # nosec B310 - https enforced above
    raise CollectionError(f"refusing to fetch non-https URL: {url}")
169
+
170
+
171
def _http_get(url: str) -> bytes:
    """Download ``url`` and return its body, rejecting responses above the size cap.

    Raises:
        CollectionError: the request fails, or the body exceeds ``_MAX_ARCHIVE_BYTES``.
    """
    limit = _MAX_ARCHIVE_BYTES
    try:
        with _open(url) as response:
            # Read one byte past the cap so an oversized body can be detected.
            payload = response.read(limit + 1)
    except (OSError, ValueError) as exc:  # URLError/HTTPError are OSError subclasses
        raise CollectionError(f"failed to fetch {url}: {exc}") from exc
    if len(payload) <= limit:
        return payload
    raise CollectionError(f"response from {url} exceeds the size limit")
181
+
182
+
183
+ def _single_root(extract_dir: Path) -> Path:
184
+ """If the archive extracted to a single top-level directory, return it; else the dir."""
185
+ entries = list(extract_dir.iterdir())
186
+ if len(entries) == 1 and entries[0].is_dir():
187
+ return entries[0]
188
+ return extract_dir
189
+
190
+
191
+ def _within(dest_resolved: Path, name: str, dest: Path) -> bool:
192
+ """True if extracting ``name`` stays inside ``dest`` (boundary-correct, not prefix-based)."""
193
+ try:
194
+ (dest / name).resolve().relative_to(dest_resolved)
195
+ return True
196
+ except ValueError:
197
+ return False
198
+
199
+
200
def _safe_extract_tar(data: bytes, dest: Path) -> None:
    """Extract a tar archive held in memory into ``dest``, admitting only safe members.

    Only regular files and directories are extracted. Symlinks and hard links
    (path-escape risk) and special members such as FIFOs and device nodes are skipped —
    a FIFO extracted from an untrusted archive could block any later attempt to read it.

    Raises:
        CollectionError: a member would land outside ``dest``, or the total size of
            regular files exceeds ``_MAX_EXTRACT_BYTES``.
    """
    dest_resolved = dest.resolve()
    with tarfile.open(fileobj=io.BytesIO(data), mode="r:*") as tf:
        safe, total = [], 0
        for m in tf.getmembers():
            if not (m.isfile() or m.isdir()):
                continue  # links, FIFOs, devices: never extracted
            if not _within(dest_resolved, m.name, dest):
                raise CollectionError("archive contains an unsafe path")
            if m.isfile():
                total += m.size
                if total > _MAX_EXTRACT_BYTES:
                    raise CollectionError("archive too large when extracted")
            safe.append(m)
        tf.extractall(dest, members=safe)  # nosec B202 - members validated above
215
+
216
+
217
def _safe_extract_zip(data: bytes, dest: Path) -> None:
    """Extract a zip archive held in memory into ``dest`` after vetting every entry.

    Raises:
        CollectionError: an entry would land outside ``dest``, or the declared
            uncompressed sizes add up to more than ``_MAX_EXTRACT_BYTES``.
    """
    base = dest.resolve()
    with zipfile.ZipFile(io.BytesIO(data)) as archive:
        accepted = []
        running = 0
        for entry in archive.infolist():
            # The upper 16 bits of external_attr carry the stored unix mode;
            # zipfile.extractall can create symlinks from it on some platforms.
            unix_mode = entry.external_attr >> 16
            if stat.S_ISLNK(unix_mode):
                continue  # never extract symlinks (path-escape risk)
            if not _within(base, entry.filename, dest):
                raise CollectionError("archive contains an unsafe path")
            running += entry.file_size
            if running > _MAX_EXTRACT_BYTES:
                raise CollectionError("archive too large when extracted")
            accepted.append(entry)
        archive.extractall(dest, members=accepted)  # nosec B202 - members validated above
232
+
233
+
234
def _collect_archive(
    source: str, ctype: str, version: str, archive_url: str, filename: str
) -> SourceContext:
    """Download a package archive, extract it safely, and build its context.

    ``filename`` selects the extractor: ``.whl`` / ``.zip`` use the zip path, anything
    else the tar path. When ``filename`` is empty the archive URL is inspected instead.
    The component type is forced to ``ctype``; ``version`` is used only when the
    extracted files do not declare one.

    Raises:
        CollectionError: the download fails, the archive is unsafe or too large, or it
            is corrupt / cannot be extracted.
    """
    data = _http_get(archive_url)
    tmp = tempfile.TemporaryDirectory(prefix="skilltotal_")
    try:
        extract_dir = Path(tmp.name) / "x"
        extract_dir.mkdir()
        kind = (filename or archive_url).lower()
        if kind.endswith((".whl", ".zip")):
            _safe_extract_zip(data, extract_dir)
        else:  # .tgz / .tar.gz / .tar.*
            _safe_extract_tar(data, extract_dir)
        root = _single_root(extract_dir)
        component = detect_component(root, source=source)
        component = replace(component, type=ctype, version=component.version or version)
        return SourceContext(root=root, component=component, _tempdir=tmp)
    except (tarfile.TarError, zipfile.BadZipFile, EOFError, OSError) as exc:
        # Corrupt or truncated archives become the error type callers already handle.
        tmp.cleanup()
        raise CollectionError(
            f"failed to extract archive from {archive_url}: {exc}"
        ) from exc
    except Exception:
        tmp.cleanup()
        raise
253
+
254
+
255
def _collect_npm(source: str) -> SourceContext:
    """Resolve an npm source to the tarball of its ``latest`` release and collect it.

    Registry metadata is treated as untrusted: invalid JSON or unexpectedly shaped
    fields produce a :class:`CollectionError` rather than an uncaught exception.

    Raises:
        CollectionError: invalid package name, unusable registry metadata, or no
            resolvable latest release.
    """
    name = npm_package_name(source)
    if not name:
        raise CollectionError(f"invalid npm package name in: {source}")
    try:
        meta = json.loads(_http_get(f"https://registry.npmjs.org/{quote(name, safe='@')}"))
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise CollectionError(
            f"npm registry returned invalid metadata for '{name}': {exc}"
        ) from exc

    tags = meta.get("dist-tags") if isinstance(meta, dict) else None
    latest = tags.get("latest") if isinstance(tags, dict) else None
    versions = meta.get("versions") if isinstance(meta, dict) else None
    release = None
    if isinstance(latest, str) and latest and isinstance(versions, dict):
        release = versions.get(latest)
    dist = release.get("dist") if isinstance(release, dict) else None
    tarball = dist.get("tarball") if isinstance(dist, dict) else None
    if not (isinstance(latest, str) and latest and isinstance(tarball, str) and tarball):
        raise CollectionError(f"npm package '{name}' has no resolvable latest release")
    # The tarball URL doubles as the filename used to pick the extractor.
    return _collect_archive(source, "npm_package", latest, tarball, tarball)
267
+
268
+
269
def _collect_pypi(source: str) -> SourceContext:
    """Resolve a PyPI source to a distribution of its current release and collect it.

    An sdist is preferred; a wheel is used when no sdist is listed. Registry metadata is
    treated as untrusted: invalid JSON or unexpectedly shaped fields produce a
    :class:`CollectionError` rather than an uncaught exception.

    Raises:
        CollectionError: invalid project name, unusable registry metadata, or no
            downloadable distribution.
    """
    name = pypi_package_name(source)
    if not name:
        raise CollectionError(f"invalid PyPI package name in: {source}")
    try:
        meta = json.loads(_http_get(f"https://pypi.org/pypi/{name}/json"))
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise CollectionError(
            f"PyPI returned invalid metadata for '{name}': {exc}"
        ) from exc

    info = meta.get("info") if isinstance(meta, dict) else None
    version = str((info if isinstance(info, dict) else {}).get("version") or "")
    listed = meta.get("urls") if isinstance(meta, dict) else None
    urls = [u for u in listed if isinstance(u, dict)] if isinstance(listed, list) else []
    chosen = next((u for u in urls if u.get("packagetype") == "sdist"), None)
    chosen = chosen or next((u for u in urls if u.get("packagetype") == "bdist_wheel"), None)
    url = chosen.get("url") if chosen else None
    if not (version and isinstance(url, str) and url):
        raise CollectionError(f"PyPI project '{name}' has no downloadable distribution")
    return _collect_archive(
        source, "python_package", version, url, str(chosen.get("filename") or "")
    )
283
+
284
+
285
def detect_component(root: Path, source: str) -> Component:
    """Work out a component's name, type and version from the files under ``root``.

    ``package.json`` wins over ``pyproject.toml`` / ``setup.py``. The directory name is
    the fallback name. An MCP manifest always sets the type to ``mcp_server``; AI
    artifact files set ``ai_component`` only when no package type was detected.
    """
    pkg_json = root / "package.json"
    pyproject = root / "pyproject.toml"

    ctype = "directory"
    meta = None
    if pkg_json.exists():
        ctype = "npm_package"
        meta = _read_package_json(pkg_json)
    else:
        has_pyproject = pyproject.exists()
        if has_pyproject or (root / "setup.py").exists():
            ctype = "python_package"
            meta = _read_pyproject(pyproject) if has_pyproject else {}

    name = root.name
    version = ""
    if meta is not None:
        name = meta.get("name") or name
        version = meta.get("version") or ""

    # MCP / AI-component overrides take precedence when their artifacts are present.
    if _has_mcp_manifest(root):
        ctype = "mcp_server"
    elif ctype == "directory" and _has_ai_artifacts(root):
        ctype = "ai_component"

    return Component(name=name, type=ctype, source=source, version=version)
313
+
314
+
315
+ def _read_package_json(path: Path) -> dict[str, str]:
316
+ try:
317
+ data = json.loads(path.read_text(encoding="utf-8", errors="replace"))
318
+ except (OSError, json.JSONDecodeError, ValueError):
319
+ return {}
320
+ if not isinstance(data, dict):
321
+ return {}
322
+ return {
323
+ "name": str(data.get("name", "")),
324
+ "version": str(data.get("version", "")),
325
+ }
326
+
327
+
328
+ def _read_pyproject(path: Path) -> dict[str, str]:
329
+ text = path.read_text(encoding="utf-8", errors="replace")
330
+ if tomllib is not None:
331
+ try:
332
+ data = tomllib.loads(text)
333
+ project = data.get("project", {}) if isinstance(data, dict) else {}
334
+ return {
335
+ "name": str(project.get("name", "")),
336
+ "version": str(project.get("version", "")),
337
+ }
338
+ except (tomllib.TOMLDecodeError, ValueError):
339
+ pass
340
+ # Fallback: best-effort regex for name/version.
341
+ name = _toml_value(text, "name")
342
+ version = _toml_value(text, "version")
343
+ return {"name": name, "version": version}
344
+
345
+
346
+ def _toml_value(text: str, key: str) -> str:
347
+ m = re.search(rf'^\s*{key}\s*=\s*"([^"]*)"', text, re.MULTILINE)
348
+ return m.group(1) if m else ""
349
+
350
+
351
+ def _has_mcp_manifest(root: Path) -> bool:
352
+ for name in ("mcp.json", ".mcp.json", "mcp.config.json"):
353
+ if (root / name).exists():
354
+ return True
355
+ # A package.json / manifest that declares mcpServers also counts.
356
+ for candidate in ("package.json", "manifest.json", "server.json"):
357
+ p = root / candidate
358
+ if p.exists():
359
+ try:
360
+ if "mcpServers" in p.read_text(encoding="utf-8", errors="replace"):
361
+ return True
362
+ except OSError:
363
+ pass
364
+ return False
365
+
366
+
367
+ def _has_ai_artifacts(root: Path) -> bool:
368
+ for name in ("SKILL.md", "AGENTS.md", "skill.md", "agents.md", "CLAUDE.md"):
369
+ if (root / name).exists():
370
+ return True
371
+ return False