skip-trace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skip_trace/__about__.py +19 -0
- skip_trace/__init__.py +6 -0
- skip_trace/__main__.py +9 -0
- skip_trace/analysis/__init__.py +4 -0
- skip_trace/analysis/evidence.py +312 -0
- skip_trace/analysis/ner.py +58 -0
- skip_trace/analysis/scoring.py +282 -0
- skip_trace/analysis/source_scanner.py +411 -0
- skip_trace/cli.py +177 -0
- skip_trace/collectors/__init__.py +4 -0
- skip_trace/collectors/github.py +241 -0
- skip_trace/collectors/package_files.py +150 -0
- skip_trace/collectors/pypi.py +158 -0
- skip_trace/collectors/whois.py +202 -0
- skip_trace/config.py +165 -0
- skip_trace/exceptions.py +22 -0
- skip_trace/main.py +269 -0
- skip_trace/py.typed.py +0 -0
- skip_trace/reporting/__init__.py +0 -0
- skip_trace/reporting/json_reporter.py +22 -0
- skip_trace/reporting/md_reporter.py +115 -0
- skip_trace/schemas.py +131 -0
- skip_trace/utils/__init__.py +4 -0
- skip_trace/utils/cache.py +77 -0
- skip_trace/utils/cli_suggestions.py +91 -0
- skip_trace/utils/http_client.py +45 -0
- skip_trace/utils/safe_targz.py +161 -0
- skip_trace/utils/validation.py +52 -0
- skip_trace-0.1.0.dist-info/METADATA +125 -0
- skip_trace-0.1.0.dist-info/RECORD +33 -0
- skip_trace-0.1.0.dist-info/WHEEL +4 -0
- skip_trace-0.1.0.dist-info/entry_points.txt +2 -0
- skip_trace-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,202 @@
|
|
1
|
+
# skip_trace/collectors/whois.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import datetime as _dt
|
5
|
+
import logging
|
6
|
+
from typing import Any, Dict, List, Optional
|
7
|
+
|
8
|
+
import whois as python_whois
|
9
|
+
from whoisit import domain as rdap_domain
|
10
|
+
|
11
|
+
from ..analysis.evidence import generate_evidence_id
|
12
|
+
from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
|
13
|
+
from ..utils.cache import get_cached_data, set_cached_data
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
def _normalize_org_name(name: Optional[str]) -> Optional[str]:
|
19
|
+
"""Cleans up organization names from WHOIS/RDAP data."""
|
20
|
+
if not isinstance(name, str):
|
21
|
+
return None
|
22
|
+
name = name.strip()
|
23
|
+
common_suffixes = [
|
24
|
+
"LLC",
|
25
|
+
"L.L.C.",
|
26
|
+
"INC",
|
27
|
+
"INCORPORATED",
|
28
|
+
"CORP",
|
29
|
+
"CORPORATION",
|
30
|
+
"LTD",
|
31
|
+
"LIMITED",
|
32
|
+
"GMBH",
|
33
|
+
"S.A.",
|
34
|
+
"S.L.",
|
35
|
+
]
|
36
|
+
up = name.upper()
|
37
|
+
for suf in common_suffixes:
|
38
|
+
suf_dot = f"{suf}."
|
39
|
+
if up.endswith(f" {suf}") or up.endswith(f",{suf}"):
|
40
|
+
name = name[: -(len(suf) + 1)].strip().rstrip(",")
|
41
|
+
break
|
42
|
+
if up.endswith(f" {suf_dot}") or up.endswith(f",{suf_dot}"):
|
43
|
+
name = name[: -(len(suf_dot) + 1)].strip().rstrip(",")
|
44
|
+
break
|
45
|
+
return name.title()
|
46
|
+
|
47
|
+
|
48
|
+
def _rdap_extract(w: Dict[str, Any]) -> Dict[str, Any]:
|
49
|
+
"""Map RDAP JSON -> normalized fields: org, registrar, creation_date, expiration_date."""
|
50
|
+
org = None
|
51
|
+
registrar = None
|
52
|
+
creation_date = None
|
53
|
+
expiration_date = None
|
54
|
+
|
55
|
+
# Entities: find registrant/registrar
|
56
|
+
for ent in w.get("entities", []) or []:
|
57
|
+
roles = {r.lower() for r in (ent.get("roles") or [])}
|
58
|
+
v = ent.get("vcardArray")
|
59
|
+
fn = None
|
60
|
+
org_v = None
|
61
|
+
if isinstance(v, list) and len(v) == 2 and isinstance(v[1], list):
|
62
|
+
for item in v[1]:
|
63
|
+
# item like ["fn", {}, "text", "Example Corp"]
|
64
|
+
if isinstance(item, list) and len(item) >= 4:
|
65
|
+
if item[0] == "fn" and isinstance(item[3], str):
|
66
|
+
fn = item[3]
|
67
|
+
if item[0] == "org" and isinstance(item[3], str):
|
68
|
+
org_v = item[3]
|
69
|
+
if "registrant" in roles and not org:
|
70
|
+
org = org_v or fn
|
71
|
+
if "registrar" in roles and not registrar:
|
72
|
+
registrar = org_v or fn
|
73
|
+
|
74
|
+
# Some registries put registrar at top-level
|
75
|
+
registrar = registrar or w.get("registrar")
|
76
|
+
|
77
|
+
# Events: registration/expiration
|
78
|
+
for ev in w.get("events", []) or []:
|
79
|
+
action = str(ev.get("eventAction", "")).lower()
|
80
|
+
date = ev.get("eventDate")
|
81
|
+
if action in {"registration", "registered"} and not creation_date:
|
82
|
+
creation_date = date
|
83
|
+
if action in {"expiration", "expiry", "paid-through"} and not expiration_date:
|
84
|
+
expiration_date = date
|
85
|
+
|
86
|
+
# ISO8601 -> datetime with tz
|
87
|
+
def _parse_dt(x: Any) -> Optional[_dt.datetime]:
|
88
|
+
if not x:
|
89
|
+
return None
|
90
|
+
try:
|
91
|
+
# RDAP dates are ISO-8601; fromisoformat handles 'Z' only in 3.11+; fall back simple replace.
|
92
|
+
s = str(x).replace("Z", "+00:00")
|
93
|
+
return _dt.datetime.fromisoformat(s)
|
94
|
+
except Exception:
|
95
|
+
return None
|
96
|
+
|
97
|
+
return {
|
98
|
+
"org": org,
|
99
|
+
"registrar": registrar,
|
100
|
+
"creation_date": _parse_dt(creation_date),
|
101
|
+
"expiration_date": _parse_dt(expiration_date),
|
102
|
+
"source": "RDAP",
|
103
|
+
}
|
104
|
+
|
105
|
+
|
106
|
+
def _whois_extract(w: Any) -> Dict[str, Any]:
|
107
|
+
"""Map python-whois result -> normalized fields."""
|
108
|
+
get = w.get if hasattr(w, "get") else lambda k, d=None: getattr(w, k, d)
|
109
|
+
return {
|
110
|
+
"org": get("org"),
|
111
|
+
"registrar": get("registrar"),
|
112
|
+
"creation_date": get("creation_date"),
|
113
|
+
"expiration_date": get("expiration_date"),
|
114
|
+
"source": "WHOIS",
|
115
|
+
}
|
116
|
+
|
117
|
+
|
118
|
+
def _lookup(domain: str) -> Dict[str, Any]:
    """Resolve registration data for *domain*: RDAP first, WHOIS fallback.

    Returns the normalized dict from _rdap_extract/_whois_extract when it
    carries an org or registrar, otherwise ``{"error": ...}``.
    """
    # 1) RDAP (HTTP/JSON; far more reliable). Failures fall through to WHOIS.
    if rdap_domain is not None:
        try:
            raw = rdap_domain(domain, timeout=10)  # type: ignore[arg-type]
            if isinstance(raw, dict):
                normalized = _rdap_extract(raw)
                if normalized.get("org") or normalized.get("registrar"):
                    return normalized
        except Exception as exc:
            logger.debug("RDAP error for %s: %s", domain, exc)

    # 2) WHOIS fallback (may be blocked or rate-limited by the registry).
    if python_whois is not None:
        try:
            raw = python_whois.whois(domain, timeout=5)
            normalized = _whois_extract(raw)
            if normalized.get("org") or normalized.get("registrar"):
                return normalized
        except Exception as exc:
            return {"error": f"WHOIS error: {exc}"}

    return {"error": "No RDAP/WHOIS client available or no usable data returned."}
|
142
|
+
|
143
|
+
|
144
|
+
def collect_from_domain(domain: str) -> List[EvidenceRecord]:
    """Collect registration ownership signals for *domain*.

    Uses RDAP (preferred) with a WHOIS fallback, and caches the normalized
    lookup result to avoid repeated lookups and registry rate limits.

    Args:
        domain: Registered domain to look up (e.g. "example.com").

    Returns:
        A one-element list with a DOMAIN evidence record when a usable
        organization name was found, otherwise an empty list.
    """
    logger.info("Checking %s", domain)
    now = _dt.datetime.now(_dt.timezone.utc)
    cache_key_ns = "rdap"  # new namespace; do not collide with legacy "whois"
    locator_base = "rdap://"

    cached = get_cached_data(cache_key_ns, domain)
    if cached:
        logger.debug("Using cached RDAP/WHOIS data for %s", domain)
        info = cached
    else:
        info = _lookup(domain)
        # Cache failures too, so dead domains are not re-queried every run.
        # NOTE(review): `info` may contain datetime objects from _rdap_extract;
        # assumes the cache layer can serialize them — confirm utils.cache.
        set_cached_data(cache_key_ns, domain, info if info else {"error": "empty"})

    if not info or "error" in info:
        logger.warning(
            "RDAP/WHOIS lookup for %s failed: %s",
            domain,
            info.get("error") if info else "unknown",
        )
        return []

    org_name = _normalize_org_name(info.get("org"))
    if not org_name:
        # Lazy %-style args for consistency with the other log calls here
        # (the original used an eagerly-formatted f-string).
        logger.warning("No org name for %s", domain)
        # Even without org, keep cache; just no evidence emitted.
        return []

    value = {
        "name": org_name,
        "domain": domain,
        "registrar": info.get("registrar"),
        "source": info.get("source", "RDAP"),
        "creation_date": info.get("creation_date"),
        "expiration_date": info.get("expiration_date"),
    }

    # Keep EvidenceSource.WHOIS for backward compatibility if RDAP enum doesn't exist in your schema.
    record = EvidenceRecord(
        id=generate_evidence_id(
            EvidenceSource.WHOIS,  # if you add EvidenceSource.RDAP later, switch based on info["source"]
            EvidenceKind.DOMAIN,
            f"{locator_base}{domain}",
            str(value),
            org_name,
        ),
        source=EvidenceSource.WHOIS,
        locator=f"{locator_base}{domain}",
        kind=EvidenceKind.DOMAIN,
        value=value,
        observed_at=now,
        confidence=0.30,
        notes=f"Domain '{domain}' registration entity normalized to '{org_name}' via {value['source']}.",
    )
    return [record]
|
skip_trace/config.py
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
# skip_trace/config.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import os
|
5
|
+
from typing import Any, Dict, Optional, cast
|
6
|
+
|
7
|
+
# Use tomllib if available (Python 3.11+), otherwise fall back to tomli
|
8
|
+
try:
|
9
|
+
import tomllib
|
10
|
+
except ImportError:
|
11
|
+
import tomli as tomllib # type: ignore
|
12
|
+
|
13
|
+
from dotenv import load_dotenv
|
14
|
+
|
15
|
+
from .exceptions import ConfigurationError
|
16
|
+
|
17
|
+
# Load .env file at module level
|
18
|
+
load_dotenv()
|
19
|
+
|
20
|
+
# Baseline configuration. User settings from [tool.skip-trace] in
# pyproject.toml are merged on top of this in load_config().
DEFAULT_CONFIG: Dict[str, Any] = {
    # Score thresholds used for CLI exit-code decisions.
    "default_min_score": 0.70,
    "default_fail_under": 0.50,
    # Whether to use an LLM for entity resolution (off by default).
    "entity_resolution_llm": False,
    # Per-signal scoring weights; "conflict" is a penalty.
    "weights": {
        "verified_release_signature": 0.50,
        "repo_org_matches_email_domain": 0.35,
        "codeowners_org_team": 0.25,
        "pypi_maintainer_corporate_domain": 0.20,
        "local_copyright_header_org": 0.25,
        "governance_doc_org": 0.20,
        "llm_ner_claim": 0.20,
        "conflict": -0.15,
    },
    # LLM provider settings; the API key itself is injected from the
    # environment variable named by "api_key_env_var" in load_config().
    "llm": {
        "provider": "openrouter",
        "model": "mistralai/mistral-7b-instruct",
        "api_key_env_var": "OPENROUTER_API_KEY",
        "base_url": "https://openrouter.ai/api/v1",
    },
    # Defaults for outbound HTTP requests.
    "http": {
        "user_agent": "skip-trace/0.1.0",
        "timeout": 30,
    },
    # GitHub API configuration
    "github": {
        "api_key_env_var": "GITHUB_TOKEN",
    },
    # Cache configuration
    "cache": {
        "enabled": True,
        "dir": ".skip_trace_cache",
        "ttl_seconds": 604800,  # 7 days
    },
    # Domains to ignore for WHOIS lookups
    # (free mail providers and code-hosting/social domains say nothing
    # about who owns a package).
    "whois_ignored_domains": [
        "gmail.com",
        "googlemail.com",
        "google.com",
        "yahoo.com",
        "hotmail.com",
        "outlook.com",
        "live.com",
        "msn.com",
        "aol.com",
        "icloud.com",
        "me.com",
        "mac.com",
        "protonmail.com",
        "pm.me",
        "github.com",
        "users.noreply.github.com",
        "gitlab.com",
        "sourceforge.net",
        "readthedocs.io",
        "twitter.com",
        "mastodon.social",
        "linkedin.com",
        "googlegroups.com",
    ],
    # Forge/tooling orgs that should not be reported as package owners.
    "suppressed_tool_orgs": [
        "github",
        "gitlab",
        "bitbucket",
        "sourceforge",
        "readthedocs",
        "codeberg",
        "pypi",  # PSF owns PyPI, but not the packages on it
    ],
}
|
90
|
+
|
91
|
+
|
92
|
+
def find_pyproject_toml(start_dir: str = ".") -> Optional[str]:
    """Locate the nearest pyproject.toml at or above *start_dir*.

    Walks the directory chain from start_dir up to the filesystem root and
    returns the absolute path of the first pyproject.toml found, or None.
    """
    # Build the chain of ancestor directories (start_dir first, root last).
    chain = [os.path.abspath(start_dir)]
    while (parent := os.path.dirname(chain[-1])) != chain[-1]:
        chain.append(parent)

    for directory in chain:
        candidate = os.path.join(directory, "pyproject.toml")
        if os.path.exists(candidate):
            return candidate
    return None
|
103
|
+
|
104
|
+
|
105
|
+
def load_config(test_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Loads configuration, allowing for test overrides.

    Priority order:
    1. test_config (if provided)
    2. [tool.skip-trace] in pyproject.toml
    3. DEFAULT_CONFIG

    Secrets (LLM / GitHub API keys) are then injected from the environment
    variables named by the config.

    Args:
        test_config: A dictionary to use as the config, for testing.

    Returns:
        The final configuration dictionary.

    Raises:
        ConfigurationError: If a pyproject.toml exists but cannot be
            read or parsed.
    """
    if test_config:
        return test_config

    import copy  # stdlib; local import keeps the test-override path cheap

    # Deep copy is required here: the previous shallow DEFAULT_CONFIG.copy()
    # aliased the nested dicts, so the .update() merge and the api_key
    # injection below mutated DEFAULT_CONFIG itself, leaking state across
    # load_config() calls.
    config = copy.deepcopy(DEFAULT_CONFIG)
    pyproject_path = find_pyproject_toml()

    if pyproject_path:
        try:
            with open(pyproject_path, "rb") as f:
                pyproject_data = tomllib.load(f)

            if tool_config := pyproject_data.get("tool", {}).get("skip-trace", {}):
                # One-level deep merge: nested dicts are merged key-by-key,
                # everything else replaces the default outright.
                for key, value in tool_config.items():
                    if isinstance(value, dict) and isinstance(config.get(key), dict):
                        config[key].update(value)
                    else:
                        config[key] = value
        except Exception as e:
            raise ConfigurationError(f"Error reading {pyproject_path}: {e}") from e

    # Load secrets from environment variables.
    # LLM API key
    llm_config = config.get("llm", {})
    api_key_env_var = llm_config.get("api_key_env_var")
    if api_key_env_var:
        api_key = os.getenv(api_key_env_var)
        # Ensure the key is nested correctly in the final config object
        config["llm"]["api_key"] = api_key

    # GitHub API key
    github_config = config.get("github", {})
    gh_api_key_env_var = github_config.get("api_key_env_var")
    if gh_api_key_env_var:
        gh_api_key = os.getenv(gh_api_key_env_var)
        config["github"]["api_key"] = gh_api_key

    # Escape hatch: any value in SKIP_TRACE_INCLUDE_TOOL_ORGS keeps
    # forge/tool orgs (github, pypi, ...) in the results.
    config["lenient_mode_enabled"] = (
        os.getenv("SKIP_TRACE_INCLUDE_TOOL_ORGS") is not None
    )

    return cast(Dict[str, Any], config)
|
162
|
+
|
163
|
+
|
164
|
+
# Load once at module level to be imported by other parts of the app
# (e.g. `from .config import CONFIG`). Note this runs at first import, so
# it reads pyproject.toml and the environment at import time.
CONFIG = load_config()
|
skip_trace/exceptions.py
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# skip_trace/exceptions.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
|
5
|
+
class SkipTraceError(Exception):
    """Base exception for all application-specific errors.

    Catch this to handle any failure raised by skip-trace itself while
    letting unrelated exceptions propagate.
    """


class ConfigurationError(SkipTraceError):
    """Raised for invalid or missing configuration (e.g. unreadable pyproject.toml)."""


class NetworkError(SkipTraceError):
    """Raised for network-related issues like timeouts or connection errors."""


class NoEvidenceError(SkipTraceError):
    """Raised when no usable evidence can be found for a package."""


class CollectorError(SkipTraceError):
    """Raised when a specific data collector fails; callers typically log and continue."""
|
skip_trace/main.py
ADDED
@@ -0,0 +1,269 @@
|
|
1
|
+
# skip_trace/main.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import argparse
|
5
|
+
import dataclasses
|
6
|
+
import json
|
7
|
+
import logging
|
8
|
+
import sys
|
9
|
+
from typing import Set
|
10
|
+
|
11
|
+
import tldextract
|
12
|
+
from rich.logging import RichHandler
|
13
|
+
|
14
|
+
from . import schemas
|
15
|
+
from .analysis import evidence as evidence_analyzer
|
16
|
+
from .analysis import scoring
|
17
|
+
from .collectors import github, package_files, pypi, whois
|
18
|
+
from .config import CONFIG
|
19
|
+
from .exceptions import CollectorError, NetworkError, NoEvidenceError
|
20
|
+
from .reporting import json_reporter, md_reporter
|
21
|
+
from .utils.validation import is_valid_email
|
22
|
+
|
23
|
+
# Create a logger instance for this module
|
24
|
+
logger = logging.getLogger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
def setup_logging(level: str = "INFO"):
    """Configure the root logger for rich console output.

    Args:
        level: The minimum logging level to display (e.g., "INFO", "DEBUG").
    """
    console_handler = RichHandler(rich_tracebacks=True, show_path=False)
    logging.basicConfig(
        handlers=[console_handler],
        level=level,
        datefmt="[%X]",
        format="%(message)s",
    )
|
39
|
+
|
40
|
+
|
41
|
+
def _candidate_domains(record: schemas.EvidenceRecord) -> Set[str]:
    """Extract registrable domains worth a WHOIS/RDAP lookup from one evidence record."""
    found: Set[str] = set()

    # Case 1: maintainer/author email -> use the domain part.
    if record.kind in (
        schemas.EvidenceKind.EMAIL,
        schemas.EvidenceKind.MAINTAINER,
        schemas.EvidenceKind.AUTHOR_TAG,
    ):
        if email := record.value.get("email"):
            if "@" in email:
                found.add(email.split("@")[1])

    # Case 2: URL from project_urls or org links.
    elif record.kind in (
        schemas.EvidenceKind.ORGANIZATION,
        schemas.EvidenceKind.PROJECT_URL,
    ):
        if url := record.value.get("url"):
            extracted = tldextract.extract(url)
            if extracted.registered_domain:
                found.add(extracted.registered_domain)

    # Case 3: contacts from a user profile (email, blog, etc.).
    elif record.kind == schemas.EvidenceKind.USER_PROFILE:
        if contacts := record.value.get("contacts"):
            for contact_value in contacts.values():
                if not contact_value:
                    continue
                if valid_email := is_valid_email(contact_value):
                    found.add(valid_email.split("@")[1])
                elif "://" in contact_value:
                    extracted = tldextract.extract(contact_value)
                    if extracted.registered_domain:
                        found.add(extracted.registered_domain)

    return found


def run_who_owns(args: argparse.Namespace) -> int:
    """Handler for the 'who-owns' command.

    Pipeline: PyPI metadata -> PyPI cross-reference -> GitHub repos ->
    WHOIS/RDAP on candidate domains -> package-file scan -> scoring -> report.

    Returns:
        0 when ownership is determined or indeterminate-but-usable
        (top score >= 0.5), 101 for no usable evidence or network failure.
    """
    logger.info(f"Executing 'who-owns' for package: {args.package}")

    try:
        # 1. Collect initial data from PyPI.
        metadata = pypi.fetch_package_metadata(args.package, args.version)
        package_name = metadata.get("info", {}).get("name", args.package)
        package_version = metadata.get("info", {}).get("version")
        logger.debug(
            f"Successfully fetched metadata for {package_name} v{package_version}"
        )

        # 2. Analyze primary package metadata.
        evidence_records, pypi_maintainers = evidence_analyzer.extract_from_pypi(
            metadata
        )

        # 3. Cross-reference other packages by the same PyPI users.
        evidence_records.extend(pypi.cross_reference_by_user(package_name))

        # 4. Analyze any GitHub repositories named in the PyPI evidence.
        repo_urls = set()
        for record in evidence_records:
            if (
                record.source == schemas.EvidenceSource.PYPI
                and record.kind == schemas.EvidenceKind.ORGANIZATION
            ):
                url = record.value.get("url")
                if url and "github.com" in url:
                    repo_urls.add(url)

        for url in repo_urls:
            logger.info(f"Analyzing GitHub repository: {url}")
            try:
                evidence_records.extend(github.extract_from_repo_url(url))
            except CollectorError as e:
                logger.warning(f"Could not fully analyze GitHub repo {url}: {e}")

        # 5. Mine domains out of the evidence and run WHOIS/RDAP on them.
        ignored_domains = set(CONFIG.get("whois_ignored_domains", []))
        domains_to_check: Set[str] = set()
        for record in evidence_records:
            domains_to_check |= _candidate_domains(record) - ignored_domains

        if domains_to_check:
            logger.info(
                f"Found domains for WHOIS lookup: {', '.join(sorted(domains_to_check))}"
            )
            for domain in domains_to_check:
                try:
                    evidence_records.extend(whois.collect_from_domain(domain))
                except CollectorError as e:
                    logger.warning(f"Could not get WHOIS evidence for {domain}: {e}")

        # 6. Analyze package contents for deep evidence.
        try:
            evidence_records.extend(package_files.collect_from_package_files(metadata))
        except CollectorError as e:
            logger.warning(f"Could not analyze package files for {package_name}: {e}")

        # 7. Score all collected evidence.
        owner_candidates = scoring.score_owners(evidence_records)

        # 8. Assemble final result object.
        package_result = schemas.PackageResult(
            package=package_name,
            version=package_version,
            owners=owner_candidates,
            maintainers=pypi_maintainers,
            evidence=evidence_records,
        )

        # 9. Report in the requested output format.
        if args.output_format == "json":
            json_reporter.render(package_result)
        else:
            md_reporter.render(package_result)

        # PEP-specified exit codes based on score (placeholder thresholds):
        # >= 0.5 means success or indeterminate — the tool itself did not
        # fail — anything lower is "no usable evidence".
        top_score = owner_candidates[0].score if owner_candidates else 0
        return 0 if top_score >= 0.5 else 101

    except NoEvidenceError as e:
        logger.error(f"{type(e).__name__}: {e}")
        return 101  # As per the PEP for "No usable evidence"
    except NetworkError as e:
        # Consistency fix: route through the configured logger instead of a
        # bare print() to stderr like the sibling handler above.
        logger.error(f"Error: A network problem occurred: {e}")
        return 101
|
187
|
+
|
188
|
+
|
189
|
+
# --- Handler for the `explain` command ---
|
190
|
+
def run_explain(args: argparse.Namespace) -> int:
    """Handler for the 'explain' command.

    Dumps the raw PyPI evidence records for a package as JSON; with ``--id``,
    prints only the first record whose id starts with that prefix.

    Returns:
        0 on success, 1 when the requested evidence id is not found,
        101 on missing evidence or network failure.
    """
    logger.info(f"Explaining evidence for package: {args.package}")
    try:
        metadata = pypi.fetch_package_metadata(args.package)
        evidence_records, _ = evidence_analyzer.extract_from_pypi(metadata)

        if args.id:
            # Prefix match so users can paste a shortened evidence id.
            record = next(
                (r for r in evidence_records if r.id.startswith(args.id)), None
            )
            if record:
                output_record = dataclasses.asdict(record)
                print(json.dumps(output_record, indent=2, default=str))
                return 0
            logger.error(f"Evidence ID matching '{args.id}' not found.")
            return 1

        # Show all evidence. (The previous `list[dict[str, str | None]]`
        # annotation was wrong: asdict() values can be nested dicts, enums,
        # datetimes, ... — json.dumps(default=str) renders those.)
        output = [dataclasses.asdict(r) for r in evidence_records]
        print(json.dumps(output, indent=2, default=str))
        return 0

    except (NoEvidenceError, NetworkError) as e:
        logger.error(f"{type(e).__name__}: {e}")
        return 101
|
219
|
+
|
220
|
+
|
221
|
+
def run_venv(args: argparse.Namespace) -> int:
    """Handler for the 'venv' command (implementation pending)."""
    target = args.path or "current environment"
    print("Executing 'venv' command...")
    print(f" Path: {target}")
    # TODO: Implement the actual logic
    return 200  # Placeholder for "No anonymous"
|
227
|
+
|
228
|
+
|
229
|
+
def run_reqs(args: argparse.Namespace) -> int:
    """Handler for the 'reqs' command (implementation pending)."""
    banner = f"Executing 'reqs' command...\n Requirements File: {args.requirements_file}"
    print(banner)
    # TODO: Implement the actual logic
    return 200  # Placeholder for "No anonymous"
|
235
|
+
|
236
|
+
|
237
|
+
# ... Add placeholder functions for other commands ...
|
238
|
+
|
239
|
+
|
240
|
+
def run_command(args: argparse.Namespace) -> int:
    """
    Dispatches the parsed arguments to the appropriate handler function.

    Args:
        args: The parsed arguments from argparse.

    Returns:
        The handler's exit code, or 2 for an unimplemented command.
    """
    # The previous `"DEBUG" if args.log_level == "DEBUG" else args.log_level`
    # was a tautology (always args.log_level); use the level directly.
    # TODO(review): honor a --verbose flag here if/when the CLI defines one.
    setup_logging(args.log_level)

    # Dispatch table: command name -> handler.
    command_handlers = {
        "who-owns": run_who_owns,
        "explain": run_explain,
        "venv": run_venv,
        "reqs": run_reqs,
        # "graph": run_graph,
        # "cache": run_cache,
        # "policy": run_policy,
    }

    handler = command_handlers.get(args.command)
    if handler:
        return handler(args)

    print(f"Error: Command '{args.command}' is not yet implemented.", file=sys.stderr)
    return 2
|
skip_trace/py.typed.py
ADDED
File without changes
|
File without changes
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# skip_trace/reporting/json_reporter.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import dataclasses
|
5
|
+
import json
|
6
|
+
import sys
|
7
|
+
from typing import IO
|
8
|
+
|
9
|
+
from ..schemas import PackageResult
|
10
|
+
|
11
|
+
|
12
|
+
def render(result: PackageResult, file: IO[str] | None = None) -> None:
    """
    Renders the PackageResult as JSON to the specified file.

    Args:
        result: The PackageResult object to render.
        file: The text stream to write to; defaults to the *current*
            sys.stdout, resolved at call time. (The previous
            ``file: IO[str] = sys.stdout`` default captured the stream at
            import time, which broke stdout redirection/capture.)
    """
    out = sys.stdout if file is None else file
    # default=str is a handler for non-serializable types like datetime
    json.dump(dataclasses.asdict(result), out, indent=2, default=str)
    out.write("\n")
|