skilltotal 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skilltotal/__init__.py +43 -0
- skilltotal/__main__.py +6 -0
- skilltotal/baseline.py +81 -0
- skilltotal/capabilities.py +31 -0
- skilltotal/cli.py +155 -0
- skilltotal/collector.py +371 -0
- skilltotal/engine.py +149 -0
- skilltotal/file_index.py +227 -0
- skilltotal/models.py +218 -0
- skilltotal/report.py +86 -0
- skilltotal/rules.py +36 -0
- skilltotal/sarif.py +108 -0
- skilltotal/scanners/__init__.py +49 -0
- skilltotal/scanners/base.py +128 -0
- skilltotal/scanners/dynamic_code.py +41 -0
- skilltotal/scanners/filesystem.py +63 -0
- skilltotal/scanners/install_scripts.py +85 -0
- skilltotal/scanners/invisible_unicode.py +127 -0
- skilltotal/scanners/mcp.py +370 -0
- skilltotal/scanners/network.py +47 -0
- skilltotal/scanners/obfuscation.py +140 -0
- skilltotal/scanners/prompt_surface.py +104 -0
- skilltotal/scanners/python_ast.py +380 -0
- skilltotal/scanners/sensitive_paths.py +174 -0
- skilltotal/scanners/shell_exec.py +47 -0
- skilltotal/scoring.py +79 -0
- skilltotal-0.3.0.dist-info/METADATA +170 -0
- skilltotal-0.3.0.dist-info/RECORD +33 -0
- skilltotal-0.3.0.dist-info/WHEEL +5 -0
- skilltotal-0.3.0.dist-info/entry_points.txt +2 -0
- skilltotal-0.3.0.dist-info/licenses/LICENSE +201 -0
- skilltotal-0.3.0.dist-info/licenses/NOTICE +6 -0
- skilltotal-0.3.0.dist-info/top_level.txt +1 -0
skilltotal/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""SkillTotal — AI Component Security Platform (core engine).
|
|
2
|
+
|
|
3
|
+
This package is the reusable core engine. Everything here is import-safe and free of
|
|
4
|
+
process-level side effects (no printing, no ``sys.exit``) *except* :mod:`skilltotal.cli`,
|
|
5
|
+
which is the thin I/O shell. Future web and enterprise products are intended to import
|
|
6
|
+
:func:`skilltotal.engine.analyze` directly.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from skilltotal.models import (
|
|
10
|
+
Capability,
|
|
11
|
+
Component,
|
|
12
|
+
Evidence,
|
|
13
|
+
Finding,
|
|
14
|
+
NeedsReview,
|
|
15
|
+
Report,
|
|
16
|
+
RiskLevel,
|
|
17
|
+
Severity,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# --- Versioned contract (consumed by downstream products such as the web app) -----------

# Package version string.
__version__ = "0.3.0"

# Semver of the code / public API; consumers should pin this value.
ENGINE_VERSION = __version__

# Shape of Report.to_dict(); bumps only on schema changes.
REPORT_SCHEMA_VERSION = "1.1"

# Integer counter of the detection ruleset; bumps when rules change, so a consumer knows
# when re-scanning old reports may surface new findings.
RULESET_VERSION = 4
|
|
29
|
+
|
|
30
|
+
# Public API of the package: the versioned contract first, then the re-exported models.
__all__ = [
    # versioned contract
    "__version__",
    "ENGINE_VERSION",
    "REPORT_SCHEMA_VERSION",
    "RULESET_VERSION",
    # models re-exported from skilltotal.models
    "Capability",
    "Component",
    "Evidence",
    "Finding",
    "NeedsReview",
    "Report",
    "RiskLevel",
    "Severity",
]
|
skilltotal/__main__.py
ADDED
skilltotal/baseline.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Baseline suppression of known findings.
|
|
2
|
+
|
|
3
|
+
A baseline records stable *fingerprints* of accepted findings so they no longer appear in
|
|
4
|
+
future scans (useful for adopting SkillTotal on an existing repo, or for CI gates). A
|
|
5
|
+
fingerprint hashes ``(rule_id, file, normalized snippet)`` — deliberately **not** the line
|
|
6
|
+
number — so it survives edits that shift lines.
|
|
7
|
+
|
|
8
|
+
Suppression is applied at the evidence level before scoring: matched evidence is removed,
|
|
9
|
+
and a finding with no remaining evidence is dropped entirely (preserving the
|
|
10
|
+
"no finding without evidence" invariant) and does not contribute to the score.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import hashlib
|
|
16
|
+
import json
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from skilltotal.models import Evidence, Finding
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def fingerprint(rule_id: str, evidence: Evidence) -> str:
    """Return a short, line-independent identifier for one evidence occurrence.

    The identifier is the first 16 hex characters of the SHA-1 of
    ``rule_id|file|stripped snippet``; the line number is deliberately left out.
    """
    fields = (rule_id, evidence.file, evidence.snippet.strip())
    payload = "{}|{}|{}".format(*fields)
    # Not a security hash: just a stable fingerprint for baseline dedup/suppression.
    hasher = hashlib.sha1(usedforsecurity=False)
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()[:16]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def finding_fingerprints(finding: Finding) -> list[str]:
    """Return one fingerprint per evidence item of ``finding``, in evidence order."""
    prints: list[str] = []
    for item in finding.evidence:
        prints.append(fingerprint(finding.id, item))
    return prints
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def apply_suppressions(
    findings: list[Finding], suppressed: set[str]
) -> tuple[list[Finding], int]:
    """Remove suppressed evidence and drop findings left without any evidence.

    Returns ``(kept, suppressed_count)``. When ``suppressed`` is empty the input list is
    returned unchanged (same object) with a count of 0.
    """
    if not suppressed:
        return findings, 0

    survivors: list[Finding] = []
    dropped = 0
    for item in findings:
        retained = []
        for ev in item.evidence:
            if fingerprint(item.id, ev) in suppressed:
                dropped += 1
            else:
                retained.append(ev)
        if not retained:
            # Every piece of evidence was suppressed: the finding disappears entirely.
            continue
        survivors.append(
            Finding(
                id=item.id,
                severity=item.severity,
                category=item.category,
                title=item.title,
                description=item.description,
                evidence=retained,
                recommendation=item.recommendation,
            )
        )
    return survivors, dropped
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_baseline(path: str | Path) -> set[str]:
    """Load a baseline file into a set of fingerprints.

    Accepts either a JSON object ``{"suppressed": [...]}`` or a plain JSON list. Any
    other shape (including a ``suppressed`` value that is not a list, e.g. ``null`` or a
    string) yields an empty set instead of raising or being iterated per character.

    Raises:
        OSError: the file cannot be read.
        ValueError: the file is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both subclasses).
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    items = data.get("suppressed") if isinstance(data, dict) else data
    if not isinstance(items, list):
        return set()
    return {str(x) for x in items}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def build_baseline(findings: list[Finding]) -> dict[str, object]:
    """Build a baseline document listing the fingerprint of every finding occurrence.

    Fingerprints are de-duplicated and sorted so the document is stable across runs.
    """
    unique: set[str] = set()
    for finding in findings:
        unique.update(finding_fingerprints(finding))
    return {"version": 1, "suppressed": sorted(unique)}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Evidence-based capability extraction.
|
|
2
|
+
|
|
3
|
+
Capabilities are a pure projection over findings: each finding's rule declares the
|
|
4
|
+
:class:`~skilltotal.models.Capability` it implies, so we simply regroup the evidence the
|
|
5
|
+
findings already proved. No file is re-scanned, and every capability is therefore
|
|
6
|
+
evidence-backed by construction.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from skilltotal.models import Capability, Evidence, Finding
|
|
12
|
+
from skilltotal.scanners import rule_by_id
|
|
13
|
+
|
|
14
|
+
# Upper bound on evidence items stored per capability; a single capability can be
# implied by many findings, so the aggregate is capped.
MAX_EVIDENCE_PER_CAPABILITY = 25
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def extract_capabilities(findings: list[Finding]) -> dict[Capability, list[Evidence]]:
    """Group the evidence of ``findings`` by the capability their rule declares.

    Findings whose rule is unknown, or whose rule declares no capability, are skipped.
    Each capability keeps at most ``MAX_EVIDENCE_PER_CAPABILITY`` evidence items.
    """
    known_rules = rule_by_id()
    grouped: dict[Capability, list[Evidence]] = {}
    for item in findings:
        matched = known_rules.get(item.id)
        if not matched:
            continue
        cap = matched.capability
        if cap is None:
            continue
        collected = grouped.setdefault(cap, [])
        for ev in item.evidence:
            if len(collected) < MAX_EVIDENCE_PER_CAPABILITY:
                collected.append(ev)
            else:
                break
    return grouped
|
skilltotal/cli.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""SkillTotal command-line interface — the only I/O shell around the core engine.
|
|
2
|
+
|
|
3
|
+
Commands:
|
|
4
|
+
skilltotal scan <path-or-url> [--json] [--sarif] [--output FILE] [--baseline FILE] [--write-baseline FILE] [--fail-on-high]
|
|
5
|
+
skilltotal rules list [--json]
|
|
6
|
+
|
|
7
|
+
Exit codes:
|
|
8
|
+
0 success
|
|
9
|
+
1 collection / baseline error (note: argparse usage errors exit with status 2)
|
|
10
|
+
2 --fail-on-high set and a finding of severity >= high was produced
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from skilltotal import __version__
|
|
21
|
+
from skilltotal.baseline import build_baseline, load_baseline
|
|
22
|
+
from skilltotal.collector import CollectionError
|
|
23
|
+
from skilltotal.engine import analyze
|
|
24
|
+
from skilltotal.models import Severity
|
|
25
|
+
from skilltotal.report import (
|
|
26
|
+
render_json,
|
|
27
|
+
render_rules_json,
|
|
28
|
+
render_rules_text,
|
|
29
|
+
render_text,
|
|
30
|
+
)
|
|
31
|
+
from skilltotal.rules import get_rules
|
|
32
|
+
from skilltotal.sarif import render_sarif
|
|
33
|
+
|
|
34
|
+
# Process exit statuses returned by the CLI entry points.
EXIT_OK = 0  # command completed
EXIT_ERROR = 1  # the source or baseline could not be processed
EXIT_FAIL_ON_HIGH = 2  # --fail-on-high was set and a high/critical finding exists
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``skilltotal`` argument parser with its ``scan`` and ``rules`` commands."""
    root = argparse.ArgumentParser(
        prog="skilltotal",
        description="AI Component Security Platform — static analysis of AI components.",
    )
    root.add_argument("--version", action="version", version=f"skilltotal {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # --- scan -----------------------------------------------------------------------
    scan_cmd = commands.add_parser("scan", help="Scan a component (local path or git URL).")
    scan_cmd.add_argument("source", help="Local directory path or git repository URL.")
    scan_options = (
        ("--json", {"action": "store_true", "help": "Emit JSON to stdout."}),
        (
            "--sarif",
            {
                "action": "store_true",
                "help": "Emit SARIF 2.1.0 to stdout (and to --output if given).",
            },
        ),
        (
            "--output",
            {
                "metavar": "FILE",
                "help": "Write the report to FILE (SARIF if --sarif, else JSON).",
            },
        ),
        (
            "--baseline",
            {
                "metavar": "FILE",
                "help": "Suppress findings whose fingerprints are listed in this baseline file.",
            },
        ),
        (
            "--write-baseline",
            {
                "metavar": "FILE",
                "help": "Write a baseline file covering the current findings, then exit normally.",
            },
        ),
        (
            "--fail-on-high",
            {
                "action": "store_true",
                "help": "Exit with code 2 if any finding is high or critical.",
            },
        ),
    )
    for flag, spec in scan_options:
        scan_cmd.add_argument(flag, **spec)

    # --- rules ----------------------------------------------------------------------
    rules_cmd = commands.add_parser("rules", help="Inspect the detection rules.")
    rules_commands = rules_cmd.add_subparsers(dest="rules_command", required=True)
    list_cmd = rules_commands.add_parser("list", help="List all detection rules.")
    list_cmd.add_argument("--json", action="store_true", help="Emit JSON to stdout.")

    return root
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected command, returning its exit status."""
    parser = build_parser()
    args = parser.parse_args(argv)

    handlers = {"scan": _cmd_scan, "rules": _cmd_rules}
    handler = handlers.get(args.command)
    if handler is not None:
        return handler(args)

    parser.error("unknown command")  # pragma: no cover
    return EXIT_ERROR
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _cmd_scan(args: argparse.Namespace) -> int:
    """Run the ``scan`` command and return the process exit status.

    Loads the optional baseline, analyzes ``args.source``, optionally writes a new
    baseline, prints the report (text, JSON or SARIF) and optionally writes it to
    ``args.output``.

    Returns ``EXIT_ERROR`` when the baseline cannot be read, the source cannot be
    collected, or an output file cannot be written; ``EXIT_FAIL_ON_HIGH`` when
    ``--fail-on-high`` is set and a high/critical finding is present; else ``EXIT_OK``.
    """
    suppress: set[str] = set()
    if args.baseline:
        try:
            suppress = load_baseline(args.baseline)
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError (non-UTF-8 file).
        except (OSError, ValueError) as exc:
            print(f"error: cannot read baseline {args.baseline}: {exc}", file=sys.stderr)
            return EXIT_ERROR

    try:
        report = analyze(args.source, suppress=suppress)
    except CollectionError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return EXIT_ERROR

    if args.write_baseline:
        doc = build_baseline(report.findings)
        try:
            Path(args.write_baseline).write_text(
                json.dumps(doc, indent=2), encoding="utf-8"
            )
        except OSError as exc:
            # Report an unwritable path as a clean CLI error instead of a traceback.
            print(
                f"error: cannot write baseline {args.write_baseline}: {exc}",
                file=sys.stderr,
            )
            return EXIT_ERROR
        print(
            f"Baseline with {len(doc['suppressed'])} fingerprint(s) written to "
            f"{args.write_baseline}",
            file=sys.stderr,
        )

    # Choose the structured renderer once; reuse for stdout and --output.
    if args.sarif:
        structured = render_sarif(report)
    else:
        structured = render_json(report)

    if args.sarif or args.json:
        print(structured)
    else:
        print(render_text(report))

    if args.output:
        try:
            Path(args.output).write_text(structured, encoding="utf-8")
        except OSError as exc:
            print(f"error: cannot write report {args.output}: {exc}", file=sys.stderr)
            return EXIT_ERROR
        print(f"Report written to {args.output}", file=sys.stderr)

    if args.fail_on_high and _has_high(report):
        return EXIT_FAIL_ON_HIGH
    return EXIT_OK
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _cmd_rules(args: argparse.Namespace) -> int:
    """Run the ``rules`` command; only the ``list`` sub-command is recognized."""
    if args.rules_command != "list":
        return EXIT_ERROR
    catalogue = get_rules()
    renderer = render_rules_json if args.json else render_rules_text
    print(renderer(catalogue))
    return EXIT_OK
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _has_high(report) -> bool:
    """Return True if any finding in ``report`` ranks at or above ``Severity.HIGH``."""
    floor = Severity.HIGH.rank
    for finding in report.findings:
        if finding.severity.rank >= floor:
            return True
    return False
|
skilltotal/collector.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
"""Source collection: resolve a path or URL into a local directory + component identity.
|
|
2
|
+
|
|
3
|
+
Supported sources: a local directory, or a remote git URL (cloned shallowly into a temp
|
|
4
|
+
directory). Component identity (name/type/version) is derived **only** from the component
|
|
5
|
+
itself — never from the user's environment.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
import shutil
|
|
14
|
+
import stat
|
|
15
|
+
import subprocess # nosec B404
|
|
16
|
+
import tarfile
|
|
17
|
+
import tempfile
|
|
18
|
+
import urllib.request
|
|
19
|
+
import zipfile
|
|
20
|
+
from dataclasses import dataclass, replace
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from urllib.parse import quote
|
|
23
|
+
|
|
24
|
+
try: # Python 3.11+
|
|
25
|
+
import tomllib
|
|
26
|
+
except ModuleNotFoundError: # pragma: no cover
|
|
27
|
+
tomllib = None # type: ignore[assignment]
|
|
28
|
+
|
|
29
|
+
from skilltotal.models import Component
|
|
30
|
+
|
|
31
|
+
# Source-classification patterns (all case-insensitive).
_GIT_URL_RE = re.compile(r"^(?:https?://|git@|ssh://|git://).+", re.IGNORECASE)
_NPMJS_URL_RE = re.compile(
    r"^https?://(?:www\.)?npmjs\.com/package/(@?[\w.-]+(?:/[\w.-]+)?)", re.IGNORECASE
)
_PYPI_URL_RE = re.compile(r"^https?://pypi\.org/project/([\w.-]+)", re.IGNORECASE)

# Conservative package-name shapes (also block path traversal in specs).
_NPM_NAME_RE = re.compile(r"^@?[a-z0-9][\w.-]*(?:/[a-z0-9][\w.-]*)?$", re.IGNORECASE)
_PYPI_NAME_RE = re.compile(r"^[a-z0-9][\w.-]*$", re.IGNORECASE)

# Network and archive limits.
_HTTP_TIMEOUT = 60  # seconds for a registry/download request
_MAX_ARCHIVE_BYTES = 150 * 1024 * 1024  # cap the downloaded archive
_MAX_EXTRACT_BYTES = 400 * 1024 * 1024  # cap total uncompressed size (decompression-bomb guard)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class CollectionError(Exception):
    """Signals that a source could not be turned into a local, analyzable directory."""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class SourceContext:
    """A component resolved onto local disk, usable as a context manager.

    Leaving the ``with`` block (or calling :meth:`cleanup`) removes the backing temporary
    directory when one was created; contexts without one are left untouched.
    """

    # Directory to analyze.
    root: Path
    # Identity (name/type/version/source) detected for the component.
    component: Component
    # Temporary directory owning ``root`` for downloaded sources; None otherwise.
    _tempdir: tempfile.TemporaryDirectory | None = None

    def __enter__(self) -> SourceContext:
        return self

    def __exit__(self, *exc: object) -> None:
        self.cleanup()

    def cleanup(self) -> None:
        """Delete the temporary directory, if any. Safe to call more than once."""
        if self._tempdir is None:
            return
        self._tempdir.cleanup()
        self._tempdir = None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_url(source: str) -> bool:
    """Return True when ``source`` (ignoring surrounding whitespace) matches ``_GIT_URL_RE``."""
    return _GIT_URL_RE.match(source.strip()) is not None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def classify_source(source: str) -> str:
    """Classify a source string into 'npm', 'pypi', 'git', or 'local'.

    Registry specs/URLs are checked before the generic URL test, so an npmjs.com or
    pypi.org URL is never treated as a git remote.
    """
    text = source.strip()
    lowered = text.lower()
    if lowered.startswith("npm:") or _NPMJS_URL_RE.match(text):
        return "npm"
    if lowered.startswith("pypi:") or _PYPI_URL_RE.match(text):
        return "pypi"
    return "git" if is_url(text) else "local"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def npm_package_name(source: str) -> str | None:
    """Extract the npm package name from an `npm:<name>` spec or an npmjs.com URL.

    Returns None when no name is present or it does not match ``_NPM_NAME_RE``.
    """
    text = source.strip()
    if text.lower().startswith("npm:"):
        candidate = text[4:].strip()
    else:
        found = _NPMJS_URL_RE.match(text)
        candidate = found.group(1) if found else ""
    if candidate and _NPM_NAME_RE.match(candidate):
        return candidate
    return None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def pypi_package_name(source: str) -> str | None:
    """Extract the PyPI project name from a `pypi:<name>` spec or a pypi.org URL.

    Returns None when no name is present or it does not match ``_PYPI_NAME_RE``.
    """
    text = source.strip()
    if text.lower().startswith("pypi:"):
        candidate = text[5:].strip()
    else:
        found = _PYPI_URL_RE.match(text)
        candidate = found.group(1) if found else ""
    if candidate and _PYPI_NAME_RE.match(candidate):
        return candidate
    return None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def collect(source: str) -> SourceContext:
    """Resolve ``source`` into a :class:`SourceContext`.

    Supports a local directory, a git URL (shallow clone), and npm / PyPI packages
    (`npm:<name>` / `pypi:<name>` specs or npmjs.com / pypi.org URLs — the latest published
    release is downloaded from the registry and extracted).
    """
    collectors = {
        "npm": _collect_npm,
        "pypi": _collect_pypi,
        "git": _collect_remote,
    }
    # Anything classify_source does not recognize as remote is treated as a local path.
    handler = collectors.get(classify_source(source), _collect_local)
    return handler(source)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _collect_local(source: str) -> SourceContext:
    """Resolve a local directory path (``~`` expanded) into a :class:`SourceContext`.

    Raises:
        CollectionError: the path does not exist or is not a directory.
    """
    directory = Path(source).expanduser().resolve()
    if not directory.exists():
        raise CollectionError(f"Path does not exist: {source}")
    if not directory.is_dir():
        raise CollectionError(f"Path is not a directory: {source}")
    return SourceContext(
        root=directory,
        component=detect_component(directory, source=str(directory)),
    )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _collect_remote(url: str) -> SourceContext:
    """Shallow-clone a remote git URL into a temp dir.

    Security: the subprocess call is intentional and reviewed — git is resolved from PATH
    (cross-platform), arguments are passed as a list (never with a shell), and the URL has
    already been validated by :func:`is_url`. A ``--`` separator precedes the URL so git
    can never interpret it as an option. The call below is annotated as a reviewed
    exception for the static security scan.

    Raises:
        CollectionError: git is missing, cannot be started, or the clone fails.
    """
    if shutil.which("git") is None:
        raise CollectionError(
            "git is required to analyze remote URLs but was not found on PATH."
        )
    # is_url() validates the stripped form, so clone exactly what was validated.
    clone_url = url.strip()
    tmp = tempfile.TemporaryDirectory(prefix="skilltotal_")
    dest = Path(tmp.name) / "repo"
    try:
        subprocess.run(  # nosec B603 B607
            ["git", "clone", "--depth", "1", "--", clone_url, str(dest)],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        tmp.cleanup()
        raise CollectionError(
            f"git clone failed for {url}: {(exc.stderr or '').strip() or exc}"
        ) from exc
    except OSError as exc:
        # git vanished or is not executable between the PATH check and the launch.
        tmp.cleanup()
        raise CollectionError(f"git clone failed for {url}: {exc}") from exc
    component = detect_component(dest, source=url)
    return SourceContext(root=dest, component=component, _tempdir=tmp)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# --------------------------------------------------------------- package registries
|
|
163
|
+
|
|
164
|
+
def _open(url: str):
    """Open ``url`` with the module timeout, refusing anything that is not https.

    Raises:
        CollectionError: ``url`` does not start with ``https://`` (case-insensitive).
    """
    if url.lower().startswith("https://"):
        return urllib.request.urlopen(url, timeout=_HTTP_TIMEOUT)  # nosec B310 - https enforced above
    raise CollectionError(f"refusing to fetch non-https URL: {url}")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _http_get(url: str) -> bytes:
    """Download ``url`` and return its body, rejecting oversized responses.

    At most ``_MAX_ARCHIVE_BYTES + 1`` bytes are read; the extra byte is what lets an
    over-limit response be detected without downloading all of it.

    Raises:
        CollectionError: the request fails or the body exceeds the size limit.
    """
    limit = _MAX_ARCHIVE_BYTES
    try:
        with _open(url) as response:
            body = response.read(limit + 1)
    except (OSError, ValueError) as exc:  # URLError/HTTPError are OSError subclasses
        raise CollectionError(f"failed to fetch {url}: {exc}") from exc
    if len(body) > limit:
        raise CollectionError(f"response from {url} exceeds the size limit")
    return body
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _single_root(extract_dir: Path) -> Path:
    """Return the sole top-level directory of ``extract_dir``, or ``extract_dir`` itself.

    The directory is unwrapped only when it contains exactly one entry and that entry is
    a directory.
    """
    children = list(extract_dir.iterdir())
    if len(children) != 1:
        return extract_dir
    only = children[0]
    return only if only.is_dir() else extract_dir
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _within(dest_resolved: Path, name: str, dest: Path) -> bool:
    """Tell whether archive member ``name`` would land inside ``dest``.

    The candidate path is resolved and compared with ``dest_resolved`` component-wise
    (``relative_to``), not by string prefix.
    """
    try:
        target = (dest / name).resolve()
        target.relative_to(dest_resolved)
    except ValueError:
        return False
    else:
        return True
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _safe_extract_tar(data: bytes, dest: Path) -> None:
    """Extract an in-memory tar archive (any compression) into ``dest`` defensively.

    Only regular files and directories are extracted. Symbolic/hard links are skipped
    (path-escape risk), and so are FIFOs and device nodes: a FIFO on disk would block
    any later attempt to read it as a file. Member paths must stay inside ``dest`` and
    the summed size of regular files must not exceed ``_MAX_EXTRACT_BYTES``.

    Raises:
        CollectionError: a member path escapes ``dest`` or the archive is too large.
    """
    dest_resolved = dest.resolve()
    with tarfile.open(fileobj=io.BytesIO(data), mode="r:*") as tf:
        safe, total = [], 0
        for m in tf.getmembers():
            if m.issym() or m.islnk():
                continue  # never extract links (path-escape risk)
            if not (m.isfile() or m.isdir()):
                continue  # never materialize FIFOs / character or block devices
            if not _within(dest_resolved, m.name, dest):
                raise CollectionError("archive contains an unsafe path")
            if m.isfile():
                total += m.size
                if total > _MAX_EXTRACT_BYTES:
                    raise CollectionError("archive too large when extracted")
            safe.append(m)
        # Defense in depth: apply the stdlib "data" extraction filter where available.
        extra = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        tf.extractall(dest, members=safe, **extra)  # nosec B202 - members validated above
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _safe_extract_zip(data: bytes, dest: Path) -> None:
    """Extract an in-memory zip archive into ``dest`` defensively.

    Entries flagged as symlinks are skipped, entry paths must stay inside ``dest``, and
    the summed declared sizes must not exceed ``_MAX_EXTRACT_BYTES``.

    Raises:
        CollectionError: an entry path escapes ``dest`` or the archive is too large.
    """
    dest_resolved = dest.resolve()
    with zipfile.ZipFile(io.BytesIO(data)) as archive:
        approved = []
        running_total = 0
        for entry in archive.infolist():
            # zipfile.extractall can create symlinks from the stored unix mode on some platforms.
            unix_mode = entry.external_attr >> 16
            if stat.S_ISLNK(unix_mode):
                continue  # never extract symlinks (path-escape risk)
            if not _within(dest_resolved, entry.filename, dest):
                raise CollectionError("archive contains an unsafe path")
            running_total += entry.file_size
            if running_total > _MAX_EXTRACT_BYTES:
                raise CollectionError("archive too large when extracted")
            approved.append(entry)
        archive.extractall(dest, members=approved)  # nosec B202 - members validated above
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _collect_archive(
    source: str, ctype: str, version: str, archive_url: str, filename: str
) -> SourceContext:
    """Download a release archive, extract it to a temp dir and describe the component.

    ``filename`` selects the extractor: ``.whl`` / ``.zip`` use the zip path, anything
    else is treated as a tar archive. The detected component is overridden with
    ``ctype`` and falls back to ``version`` when the archive declares none.

    Raises:
        CollectionError: the download fails, or the archive is unsafe, too large,
            corrupt or otherwise cannot be extracted.
    """
    data = _http_get(archive_url)
    tmp = tempfile.TemporaryDirectory(prefix="skilltotal_")
    try:
        extract_dir = Path(tmp.name) / "x"
        extract_dir.mkdir()
        try:
            if filename.lower().endswith((".whl", ".zip")):
                _safe_extract_zip(data, extract_dir)
            else:  # .tgz / .tar.gz / .tar.*
                _safe_extract_tar(data, extract_dir)
        except (tarfile.TarError, zipfile.BadZipFile, EOFError, OSError) as exc:
            # Corrupt/truncated archives are a collection failure, not a crash.
            raise CollectionError(
                f"failed to extract archive from {archive_url}: {exc}"
            ) from exc
        root = _single_root(extract_dir)
        component = detect_component(root, source=source)
        component = replace(component, type=ctype, version=component.version or version)
        return SourceContext(root=root, component=component, _tempdir=tmp)
    except Exception:
        # Any failure after the temp dir exists must not leave it behind.
        tmp.cleanup()
        raise
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _collect_npm(source: str) -> SourceContext:
    """Download and extract the ``latest`` release of an npm package.

    Raises:
        CollectionError: the name is invalid, the registry response is not a usable
            JSON object, or no tarball can be resolved for the ``latest`` dist-tag.
    """
    name = npm_package_name(source)
    if not name:
        raise CollectionError(f"invalid npm package name in: {source}")
    raw = _http_get(f"https://registry.npmjs.org/{quote(name, safe='@')}")
    try:
        meta = json.loads(raw)
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise CollectionError(
            f"npm registry returned invalid metadata for '{name}': {exc}"
        ) from exc
    if not isinstance(meta, dict):
        raise CollectionError(f"npm registry returned unexpected metadata for '{name}'")

    # Walk dist-tags.latest -> versions[latest].dist.tarball, tolerating odd shapes.
    dist_tags = meta.get("dist-tags")
    latest = dist_tags.get("latest") if isinstance(dist_tags, dict) else None
    versions = meta.get("versions")
    release = (
        versions.get(latest)
        if isinstance(versions, dict) and isinstance(latest, str)
        else None
    )
    dist = release.get("dist") if isinstance(release, dict) else None
    tarball = dist.get("tarball") if isinstance(dist, dict) else None
    if not (latest and isinstance(tarball, str) and tarball):
        raise CollectionError(f"npm package '{name}' has no resolvable latest release")
    return _collect_archive(source, "npm_package", str(latest), tarball, tarball)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _collect_pypi(source: str) -> SourceContext:
    """Download and extract the current release of a PyPI project.

    The sdist is preferred; a wheel is used when no sdist is published.

    Raises:
        CollectionError: the name is invalid, the registry response is not a usable
            JSON object, or the release has no downloadable sdist/wheel.
    """
    name = pypi_package_name(source)
    if not name:
        raise CollectionError(f"invalid PyPI package name in: {source}")
    raw = _http_get(f"https://pypi.org/pypi/{name}/json")
    try:
        meta = json.loads(raw)
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise CollectionError(
            f"PyPI returned invalid metadata for '{name}': {exc}"
        ) from exc
    if not isinstance(meta, dict):
        raise CollectionError(f"PyPI returned unexpected metadata for '{name}'")

    info = meta.get("info")
    version = str(info.get("version") or "") if isinstance(info, dict) else ""
    urls = meta.get("urls")
    files = [u for u in urls if isinstance(u, dict)] if isinstance(urls, list) else []
    chosen = next((u for u in files if u.get("packagetype") == "sdist"), None)
    chosen = chosen or next((u for u in files if u.get("packagetype") == "bdist_wheel"), None)
    if not (version and chosen and chosen.get("url")):
        raise CollectionError(f"PyPI project '{name}' has no downloadable distribution")
    return _collect_archive(
        source,
        "python_package",
        version,
        str(chosen["url"]),
        str(chosen.get("filename") or ""),
    )
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def detect_component(root: Path, source: str) -> Component:
    """Derive component name/type/version solely from files inside ``root``.

    ``package.json`` wins over ``pyproject.toml`` / ``setup.py`` for name and version.
    An MCP manifest always sets the type to ``mcp_server``; AI artifacts set
    ``ai_component`` only when no package manifest was found.
    """
    manifest_meta: dict[str, str] = {}
    kind = "directory"

    package_json = root / "package.json"
    if package_json.exists():
        kind = "npm_package"
        manifest_meta = _read_package_json(package_json)
    else:
        pyproject = root / "pyproject.toml"
        has_pyproject = pyproject.exists()
        if has_pyproject or (root / "setup.py").exists():
            kind = "python_package"
            if has_pyproject:
                manifest_meta = _read_pyproject(pyproject)

    # Fall back to the directory name / empty version when the manifest is silent.
    name = manifest_meta.get("name") or root.name
    version = manifest_meta.get("version") or ""

    # MCP / AI-component overrides take precedence when their artifacts are present.
    if _has_mcp_manifest(root):
        kind = "mcp_server"
    elif kind == "directory" and _has_ai_artifacts(root):
        kind = "ai_component"

    return Component(name=name, type=kind, source=source, version=version)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _read_package_json(path: Path) -> dict[str, str]:
    """Read ``name`` and ``version`` from a ``package.json`` file.

    Returns an empty dict when the file is unreadable, not valid JSON, or not a JSON
    object. Missing or ``null`` fields become empty strings (a ``null`` is not turned
    into the literal text ``"None"``); other values are converted with ``str``.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8", errors="replace"))
    except (OSError, json.JSONDecodeError, ValueError):
        return {}
    if not isinstance(data, dict):
        return {}

    def _text(key: str) -> str:
        value = data.get(key)
        return "" if value is None else str(value)

    return {
        "name": _text("name"),
        "version": _text("version"),
    }
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _read_pyproject(path: Path) -> dict[str, str]:
|
|
329
|
+
text = path.read_text(encoding="utf-8", errors="replace")
|
|
330
|
+
if tomllib is not None:
|
|
331
|
+
try:
|
|
332
|
+
data = tomllib.loads(text)
|
|
333
|
+
project = data.get("project", {}) if isinstance(data, dict) else {}
|
|
334
|
+
return {
|
|
335
|
+
"name": str(project.get("name", "")),
|
|
336
|
+
"version": str(project.get("version", "")),
|
|
337
|
+
}
|
|
338
|
+
except (tomllib.TOMLDecodeError, ValueError):
|
|
339
|
+
pass
|
|
340
|
+
# Fallback: best-effort regex for name/version.
|
|
341
|
+
name = _toml_value(text, "name")
|
|
342
|
+
version = _toml_value(text, "version")
|
|
343
|
+
return {"name": name, "version": version}
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _toml_value(text: str, key: str) -> str:
    """Return the first double-quoted value assigned to ``key`` at the start of a line.

    ``key`` is matched literally (regex metacharacters in it are escaped). Returns an
    empty string when no such assignment exists.
    """
    m = re.search(rf'^\s*{re.escape(key)}\s*=\s*"([^"]*)"', text, re.MULTILINE)
    return m.group(1) if m else ""
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _has_mcp_manifest(root: Path) -> bool:
    """Report whether ``root`` carries an MCP manifest.

    True when a dedicated MCP config file exists, or when ``package.json`` /
    ``manifest.json`` / ``server.json`` contains the text ``mcpServers``.
    """
    dedicated = ("mcp.json", ".mcp.json", "mcp.config.json")
    if any((root / filename).exists() for filename in dedicated):
        return True
    # A package.json / manifest that declares mcpServers also counts.
    for filename in ("package.json", "manifest.json", "server.json"):
        manifest = root / filename
        if not manifest.exists():
            continue
        try:
            content = manifest.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        if "mcpServers" in content:
            return True
    return False
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _has_ai_artifacts(root: Path) -> bool:
    """Report whether ``root`` contains a well-known AI instruction file at its top level."""
    markers = ("SKILL.md", "AGENTS.md", "skill.md", "agents.md", "CLAUDE.md")
    return any((root / marker).exists() for marker in markers)
|