skip_trace-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skip_trace/analysis/source_scanner.py ADDED
@@ -0,0 +1,394 @@
+ # skip_trace/analysis/source_scanner.py
+ from __future__ import annotations
+
+ import datetime
+ import logging
+ import os
+ import re
+ import string
+ from typing import List
+
+ from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
+ from ..utils.validation import is_valid_email
+ from . import ner
+ from .evidence import _parse_contact_string, generate_evidence_id
+
+ logger = logging.getLogger(__name__)
+
+ # Regex to find copyright notices, capturing the holder.
+ # Looks for "Copyright", optional (c) symbol, optional year, then the owner.
+ COPYRIGHT_RE = re.compile(
+     r"copyright\s*(?:\(c\))?\s*(?:[0-9,\-\s]+)?\s*([^\n]+)", re.IGNORECASE
+ )
+
+ # Regex to find __author__ assignments
+ AUTHOR_RE = re.compile(r"__author__\s*=\s*['\"]([^'\"]+)['\"]")
+
+ # Regex for finding standalone email addresses - used as a fast pre-filter
+ EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+
+ # Words that indicate a regex grabbed junk from a license instead of a name.
+ # This filter now lives in the scanner, where the bad evidence is generated.
+ JUNK_WORDS = {
+     "copyright",
+     "holders",
+     "license",
+     "document",
+     "accompanies",
+     "notice",
+     "authors",
+     "identifies",
+     "endorse",
+     "promote",
+     "software",
+     "permission",
+     "conditions",
+     # stop words
+     "and",
+     "other",
+     "the",
+     "for",
+     "with",
+     "this",
+     "list",
+     "following",
+     "txt",
+     "damages",
+     "owner",
+     # legalese
+     "incidental",
+     "holder",
+     "liability",
+     # license names
+     "mit",
+     "bsd",
+ }
+
+ # --- NEW: Filename allowlist and more robust binary detection ---
+
+ # A set of common extensionless text files that should never be treated as binary.
+ TEXT_FILENAMES = {
+     "readme",
+     "license",
+     "copying",
+     "notice",
+     "authors",
+     "contributors",
+     "changelog",
+     "history",
+     "install",
+     "makefile",
+     "dockerfile",
+     "vagrantfile",
+ }
+
+
+ def _is_binary_file(filepath: str, chunk_size: int = 1024) -> bool:
+     """
+     Heuristically determines if a file is binary using a multi-step check.
+
+     1. Checks against an allowlist of common text filenames (e.g., 'LICENSE').
+     2. Checks for the presence of NULL bytes.
+     3. Checks the ratio of non-printable text characters.
+
+     Args:
+         filepath: The path to the file to check.
+         chunk_size: The number of bytes to read from the beginning of the file.
+
+     Returns:
+         True if the file is likely binary, False otherwise.
+     """
+     # 1. Check filename allowlist first.
+     basename = os.path.basename(filepath).lower()
+     if basename in TEXT_FILENAMES:
+         return False
+
+     try:
+         with open(filepath, "rb") as f:
+             chunk = f.read(chunk_size)
+     except IOError:
+         return True  # Cannot read, so skip it.
+
+     if not chunk:
+         return False  # Empty file is not binary.
+
+     # 2. A null byte is a strong indicator of a binary file.
+     if b"\0" in chunk:
+         return True
+
+     # 3. Check the ratio of text characters to total characters.
+     # A high percentage of non-printable characters indicates binary data.
+     printable = set(bytes(string.printable, "ascii"))
+     non_printable_count = sum(1 for byte in chunk if byte not in printable)
+
+     # If more than 30% of the characters are non-printable, it's likely binary.
+     ratio = non_printable_count / len(chunk)
+     return ratio > 0.3
+
+
+ def _process_authors_file(
+     content: str, locator: str, now: datetime.datetime
+ ) -> List[EvidenceRecord]:
+     """Processes an AUTHORS file, treating each non-blank line as a potential author."""
+     evidence_list = []
+     logger.debug(f"Processing AUTHORS file at: {locator}")
+     lines = [line.strip() for line in content.splitlines()]
+     for line in lines:
+         if not line or line.startswith("#"):
+             continue
+
+         parsed = _parse_contact_string(line)
+         if not parsed.get("name") and not parsed.get("email"):
+             continue
+
+         value = {"name": parsed["name"], "email": parsed["email"]}
+         name_for_slug = parsed["name"] or parsed["email"] or "unknown"
+
+         record = EvidenceRecord(
+             id=generate_evidence_id(
+                 EvidenceSource.WHEEL,
+                 EvidenceKind.AUTHOR_TAG,
+                 locator,
+                 str(value),
+                 name_for_slug,
+             ),
+             source=EvidenceSource.WHEEL,
+             locator=locator,
+             kind=EvidenceKind.AUTHOR_TAG,
+             value=value,
+             observed_at=now,
+             confidence=0.20,  # Higher confidence than a random email
+             notes=f"Found author '{line}' in AUTHORS file.",
+         )
+         evidence_list.append(record)
+         logger.debug(f"Found author from AUTHORS file: {line}")
+
+     return evidence_list
+
+
+ def scan_directory(directory_path: str, locator_prefix: str) -> List[EvidenceRecord]:
+     """
+     Scans a directory of files for ownership evidence.
+
+     Args:
+         directory_path: The absolute path to the directory to scan.
+         locator_prefix: A prefix for the evidence locator (e.g., package name/version).
+
+     Returns:
+         A list of EvidenceRecord objects found in the files.
+     """
+     evidence_list: List[EvidenceRecord] = []
+     now = datetime.datetime.now(datetime.timezone.utc)
+
+     skip_dirs = {
+         ".git",
+         "__pycache__",
+         ".idea",
+         ".vscode",
+         "dist",
+         "build",
+         ".egg-info",
+         "node_modules",
+     }
+     # More comprehensive list of binary extensions
+     skip_extensions = {
+         ".pyc",
+         ".pyo",
+         ".so",
+         ".pyd",
+         ".egg",
+         ".whl",  # Python
+         ".o",
+         ".a",
+         ".dll",
+         ".exe",  # Compiled
+         ".svg",
+         ".png",
+         ".jpg",
+         ".jpeg",
+         ".gif",
+         ".ico",
+         ".webp",  # Images
+         ".woff",
+         ".woff2",
+         ".ttf",
+         ".eot",
+         ".otf",  # Fonts
+         ".zip",
+         ".tar",
+         ".gz",
+         ".bz2",
+         ".7z",
+         ".rar",  # Archives
+         ".pdf",
+         ".doc",
+         ".docx",
+         ".xls",
+         ".xlsx",
+         ".ppt",
+         ".pptx",
+         ".odt",  # Docs
+         ".mp3",
+         ".mp4",
+         ".wav",
+         ".flac",
+         ".ogg",
+         ".mov",
+         ".avi",
+         ".mkv",  # Media
+     }
+
+     file_count = 0
+     for root, dirs, files in os.walk(directory_path):
+         # Modify dirs in-place to prune the search
+         dirs[:] = [d for d in dirs if d not in skip_dirs]
+
+         for filename in files:
+             file_path = os.path.join(root, filename)
+             relative_path = os.path.relpath(file_path, directory_path)
+             file_count += 1
+
+             _, extension = os.path.splitext(filename)
+             if extension.lower() in skip_extensions:
+                 continue
+
+             if _is_binary_file(file_path):
+                 logger.debug(
+                     f"Skipping binary file detected by content: {relative_path}"
+                 )
+                 continue
+
+             logger.debug(f"Scanning file: {relative_path}")
+
+             try:
+                 with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                     content = f.read()
+
+                 locator = f"{locator_prefix}/{relative_path}"
+
+                 # 1. Special handling for AUTHORS files
+                 if filename.lower().startswith(
+                     "authors"
+                 ) or filename.lower().startswith("contributors"):
+                     evidence_list.extend(_process_authors_file(content, locator, now))
+                     continue  # Don't process this file further for generic matches
+
+                 # 2. Use NER for copyright lines
+                 for match in COPYRIGHT_RE.finditer(content):
+                     copyright_text = match.group(1).strip().rstrip(",.")
+
+                     # Try NER first
+                     entities = ner.extract_entities(copyright_text)
+                     if entities:
+                         for entity_name, entity_label in entities:
+                             if entity_name.lower() not in JUNK_WORDS:
+                                 value: dict[str, str | None] = {
+                                     "holder": entity_name,
+                                     "file": relative_path,
+                                 }
+                                 notes = f"Found copyright holder '{entity_name}' via NER ({entity_label})."
+                                 record = EvidenceRecord(
+                                     id=generate_evidence_id(
+                                         EvidenceSource.WHEEL,
+                                         EvidenceKind.COPYRIGHT,
+                                         locator,
+                                         str(value),
+                                         entity_name,
+                                     ),
+                                     source=EvidenceSource.WHEEL,
+                                     locator=locator,
+                                     kind=EvidenceKind.COPYRIGHT,
+                                     value=value,
+                                     observed_at=now,
+                                     confidence=0.40,  # Higher confidence for NER
+                                     notes=notes,
+                                 )
+                                 # Avoid appending duplicate records with identical notes.
+                                 if not any(
+                                     already.notes == notes for already in evidence_list
+                                 ):
+                                     evidence_list.append(record)
+                     # else:
+                     #     # --- Stricter filtering for the regex fallback ---
+                     #     # 1. Reject if it's too long to be a name.
+                     #     if len(copyright_text) > 50: continue
+                     #     # 2. Reject if it contains common license garbage words.
+                     #     if any(word in copyright_text.lower() for word in JUNK_WORDS): continue
+                     #
+                     #     value = {"holder": copyright_text, "file": relative_path}
+                     #     record = EvidenceRecord(
+                     #         id=generate_evidence_id(EvidenceSource.WHEEL, EvidenceKind.COPYRIGHT, locator, str(value),
+                     #                                 copyright_text),
+                     #         source=EvidenceSource.WHEEL, locator=locator, kind=EvidenceKind.COPYRIGHT,
+                     #         value=value, observed_at=now, confidence=0.25,
+                     #         notes=f"Found copyright notice for '{copyright_text}' in file (regex fallback)."
+                     #     )
+                     #     evidence_list.append(record)
+
+                 # 3. Scan for __author__ tags in Python files
+                 if filename.endswith(".py"):
+                     for match in AUTHOR_RE.finditer(content):
+                         author_str = match.group(1).strip()
+                         parsed = _parse_contact_string(author_str)
+                         if not parsed.get("name") and not parsed.get("email"):
+                             continue
+
+                         value = {"name": parsed["name"], "email": parsed["email"]}
+                         name_for_slug = parsed["name"] or parsed["email"] or "unknown"
+                         record = EvidenceRecord(
+                             id=generate_evidence_id(
+                                 EvidenceSource.WHEEL,
+                                 EvidenceKind.AUTHOR_TAG,
+                                 locator,
+                                 str(value),
+                                 name_for_slug,
+                             ),
+                             source=EvidenceSource.WHEEL,
+                             locator=locator,
+                             kind=EvidenceKind.AUTHOR_TAG,
+                             value=value,
+                             observed_at=now,
+                             confidence=0.20,
+                             notes=f"Found __author__ tag for '{author_str}' in file.",
+                         )
+                         evidence_list.append(record)
+
+                 # 4. Scan for any standalone email address (lower confidence)
+                 # First, find candidates with regex, then validate them properly.
+                 for match in EMAIL_RE.finditer(content):
+                     potential_email = match.group(0)
+                     if valid_email := is_valid_email(potential_email):
+                         value = {"name": None, "email": valid_email}
+                         notes = (
+                             f"Found validated contact email '{valid_email}' in file."
+                         )
+                         record = EvidenceRecord(
+                             id=generate_evidence_id(
+                                 EvidenceSource.WHEEL,
+                                 EvidenceKind.CONTACT,
+                                 locator,
+                                 str(value),
+                                 valid_email,
+                             ),
+                             source=EvidenceSource.WHEEL,
+                             locator=locator,
+                             kind=EvidenceKind.CONTACT,
+                             value=value,
+                             observed_at=now,
+                             confidence=0.15,  # Slightly higher confidence now that it's validated
+                             notes=notes,
+                         )
+                         # Avoid appending duplicate records with identical notes.
+                         if not any(
+                             already.notes == notes for already in evidence_list
+                         ):
+                             evidence_list.append(record)
+
+             except (IOError, UnicodeDecodeError) as e:
+                 logger.debug(f"Could not read or process file {file_path}: {e}")
+                 continue
+
+     logger.info(
+         f"Scanned {file_count} files in directory, found {len(evidence_list)} potential evidence records."
+     )
+     return evidence_list
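
For orientation, the scanner above can be driven directly once a wheel has been unpacked. A minimal sketch, assuming an illustrative extraction path and locator prefix (neither ships with the package); it relies only on scan_directory and the EvidenceRecord fields visible in this diff:

    # Hypothetical driver for source_scanner; path and prefix are placeholders.
    from skip_trace.analysis.source_scanner import scan_directory

    records = scan_directory("/tmp/unpacked-wheel", "skip-trace/0.1.0")
    for rec in records:
        # kind, confidence, and value are fields set on each EvidenceRecord above.
        print(rec.kind, rec.confidence, rec.value)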
skip_trace/cli.py ADDED
@@ -0,0 +1,177 @@
+ # skip_trace/cli.py
+ from __future__ import annotations
+
+ import sys
+ from typing import List, Optional
+
+ from rich_argparse import RichHelpFormatter
+
+ from . import __version__
+ from .main import run_command
+ from .utils.cli_suggestions import SmartParser
+
+
+ def create_parser() -> SmartParser:
+     """Creates the main argument parser for the application."""
+
+     parser = SmartParser(
+         prog="skip-trace",
+         description="Infer ownership of Python packages from public artifacts and local source.",
+         epilog="For more help on a specific command, use: skip-trace <command> -h",
+         formatter_class=RichHelpFormatter,
+     )
+     parser.add_argument(
+         "-v", "--version", action="version", version=f"%(prog)s {__version__}"
+     )
+
+     # --- --verbose flag ---
+     parser.add_argument(
+         "--verbose",
+         action="store_const",
+         dest="log_level",
+         const="DEBUG",
+         default="WARNING",
+         help="Enable verbose (debug) logging.",
+     )
+     parser.add_argument(
+         "--log-level",
+         choices=["ERROR", "WARNING", "INFO", "DEBUG"],
+         help="Set the logging level (overridden by --verbose).",
+     )
+
+     fmt = parser.add_mutually_exclusive_group()
+     fmt.add_argument(
+         "--json",
+         dest="output_format",
+         action="store_const",
+         const="json",
+         help="Output results in JSON format.",
+     )
+     fmt.add_argument(
+         "--md",
+         dest="output_format",
+         action="store_const",
+         const="md",
+         help="Output results in Markdown format.",
+     )
+
+     parser.add_argument(
+         "--no-redact",
+         action="store_true",
+         help="Do not redact contact information in output.",
+     )
+     parser.add_argument(
+         "--llm-ner",
+         choices=["off", "on", "auto"],
+         default="auto",
+         help="Control LLM-assisted Named Entity Recognition.",
+     )
+     parser.add_argument(
+         "--jobs", type=int, default=None, help="Number of concurrent jobs to run."
+     )
+     parser.add_argument(
+         "--cache-dir", type=str, default=None, help="Path to the cache directory."
+     )
+
+     sub = parser.add_subparsers(dest="command", required=True, title="Commands")
+
+     # --- `who-owns` subcommand ---
+     p_who = sub.add_parser(
+         "who-owns", help="Find ownership for a single remote package."
+     )
+     p_who.add_argument("package", help="The name of the package (e.g., 'requests').")
+     p_who.add_argument("--version", help="The specific version of the package.")
+
+     # --- `venv` subcommand ---
+     p_venv = sub.add_parser(
+         "venv", help="Scan all packages in a virtual environment (not yet implemented)."
+     )
+     p_venv.add_argument(
+         "--path", help="Path to the Python executable or site-packages of the venv."
+     )
+
+     # --- `reqs` subcommand ---
+     p_reqs = sub.add_parser(
+         "reqs", help="Scan packages from a requirements file (not yet implemented)."
+     )
+     p_reqs.add_argument("requirements_file", help="Path to the requirements.txt file.")
+
+     # --- `explain` subcommand ---
+     p_explain = sub.add_parser(
+         "explain",
+         help="Show the evidence behind an ownership claim (not yet implemented).",
+     )
+     p_explain.add_argument("package", help="The name of the package.")
+     p_explain.add_argument("--id", help="The specific evidence ID to display.")
+
+     # --- `graph` subcommand ---
+     p_graph = sub.add_parser(
+         "graph", help="Generate an ownership graph for a package (not yet implemented)."
+     )
+     p_graph.add_argument("package", help="The name of the package.")
+     p_graph.add_argument(
+         "--format",
+         choices=["dot", "mermaid"],
+         default="mermaid",
+         help="The output format for the graph.",
+     )
+
+     # --- `cache` subcommand ---
+     p_cache = sub.add_parser("cache", help="Manage the local cache.")
+     cache_group = p_cache.add_mutually_exclusive_group(required=True)
+     cache_group.add_argument(
+         "--clear",
+         action="store_true",
+         help="Clear all cached data (not yet implemented).",
+     )
+     cache_group.add_argument(
+         "--show", action="store_true", help="Show cache statistics and location."
+     )
+
+     # --- `policy` subcommand ---
+     p_policy = sub.add_parser(
+         "policy", help="Configure and view policy thresholds (not yet implemented)."
+     )
+     p_policy.add_argument(
+         "--min-score", type=float, help="Set the minimum score for a package to 'pass'."
+     )
+     p_policy.add_argument(
+         "--fail-under",
+         type=float,
+         help="Set the score below which a package is 'anonymous'.",
+     )
+
+     # Set default output format
+     parser.set_defaults(output_format="md")
+
+     return parser
+
+
+ def main(argv: Optional[List[str]] = None) -> int:
+     """
+     Main entry point for the CLI.
+
+     Parses arguments and dispatches to the main application logic.
+     :param argv: Command line arguments (defaults to sys.argv[1:]).
+     :return: Exit code.
+     """
+     if argv is None:
+         argv = sys.argv[1:]
+
+     parser = create_parser()
+     args = parser.parse_args(argv)
+
+     # When stdout is not a TTY (e.g. piped), force JSON output.
+     if (
+         not sys.stdout.isatty()
+         and "output_format" in args
+         and args.output_format != "json"
+     ):
+         args.output_format = "json"
+
+     try:
+         return run_command(args)
+     except Exception as e:
+         # TODO: Add proper logging based on log-level
+         print(f"An unexpected error occurred: {e}", file=sys.stderr)
+         return 1
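
A rough invocation sketch for the CLI above, using only flags and subcommands defined in create_parser (the package name and version are illustrative; note that global flags such as --json must precede the subcommand):

    # Hypothetical programmatic invocation, mirroring:
    #   skip-trace --json who-owns requests --version 2.32.0
    from skip_trace.cli import main

    exit_code = main(["--json", "who-owns", "requests", "--version", "2.32.0"])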
skip_trace/collectors/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # skip_trace/collectors/__init__.py
+ from . import github, package_files, pypi, whois
+
+ __all__ = ["github", "pypi", "whois", "package_files"]