PyPI - skip-trace - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

skip-trace 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

skip_trace/__about__.py +13 -3
skip_trace/__init__.py +0 -2
skip_trace/analysis/content_scanner.py +189 -0
skip_trace/analysis/evidence.py +1 -1
skip_trace/analysis/scoring.py +46 -1
skip_trace/analysis/source_scanner.py +1 -1
skip_trace/cli.py +1 -1
skip_trace/collectors/__init__.py +2 -2
skip_trace/collectors/github_files.py +359 -0
skip_trace/collectors/package_files.py +232 -41
skip_trace/collectors/pypi.py +1 -1
skip_trace/collectors/pypi_attestations.py +160 -0
skip_trace/collectors/sigstore.py +160 -0
skip_trace/collectors/urls.py +96 -0
skip_trace/m.py +287 -0
skip_trace/main.py +103 -85
skip_trace/reporting/md_reporter.py +68 -4
skip_trace/schemas.py +21 -0
skip_trace/utils/http_client.py +18 -0
{skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/METADATA +7 -3
skip_trace-0.1.1.dist-info/RECORD +39 -0
skip_trace-0.1.0.dist-info/RECORD +0 -33
{skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/WHEEL +0 -0
{skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/entry_points.txt +0 -0
{skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/licenses/LICENSE +0 -0

skip_trace/__about__.py CHANGED Viewed

@@ -5,15 +5,25 @@ __all__ = [
     "__version__",
     "__description__",
     "__readme__",
+    "__license__",
     "__credits__",
     "__requires_python__",
     "__status__",
+    "__keywords__",
 ]
 __title__ = "skip-trace"
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 __description__ = "Ownership Attribution for Python Packages"
 __readme__ = "README.md"
+__license__ = "MIT"
 __credits__ = [{"name": "Matthew Dean Martin", "email": "matthewdeanmartin@gmail.com"}]
-__requires_python__ = ">=3.8"
-__status__ = "1 - Planning"
+__requires_python__ = ">=3.13"
+__status__ = "3 - Alpha"
+__keywords__ = [
+    "PyPI maintainers",
+    "package owners",
+    "package provenance",
+    "software supply chain",
+    "PEP 541",
+]

skip_trace/__init__.py CHANGED Viewed

@@ -1,6 +1,4 @@
 # skip_trace/__init__.py
-__version__ = "0.1.0"
 # __all__ will be populated as public functions/classes are added.
 __all__ = []

skip_trace/analysis/content_scanner.py ADDED Viewed

@@ -0,0 +1,189 @@
+# skip_trace/analysis/content_scanner.py
+from __future__ import annotations
+import datetime
+import logging
+import re
+from typing import List
+from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
+from ..utils.validation import is_valid_email
+from . import ner
+from .evidence import _parse_contact_string, generate_evidence_id
+# Regex to find copyright notices, capturing the holder.
+COPYRIGHT_RE = re.compile(
+    r"copyright\s*(?:\(c\))?\s*(?:[0-9,\-\s]+)?\s*([^\n]+)", re.IGNORECASE
+)
+# Regex to find __author__ assignments
+AUTHOR_RE = re.compile(r"__author__\s*=\s*['\"]([^'\"]+)['\"]")
+# Regex for finding standalone email addresses - used as a fast pre-filter
+EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b")
+# --- Regex for finding URLs in text content ---
+URL_RE = re.compile(
+    r"""\b(?:https?://|www\.)[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+(?:[/?#]\S*)?"""
+)
+# Words that indicate a regex grabbed junk from a license instead of a name.
+JUNK_WORDS = {
+    "copyright",
+    "holders",
+    "license",
+    "document",
+    "accompanies",
+    "notice",
+    "authors",
+    "identifies",
+    "endorse",
+    "promote",
+    "software",
+    "permission",
+    "conditions",
+    "and",
+    "other",
+    "the",
+    "for",
+    "with",
+    "this",
+    "list",
+    "following",
+    "txt",
+    "damages",
+    "owner",
+    "incidental",
+    "holder",
+    "liability",
+    "MIT",
+    "BSD",
+}
+logger = logging.getLogger(__name__)
+def scan_text(
+    content: str, locator: str, source: EvidenceSource, is_python_file: bool = False
+) -> List[EvidenceRecord]:
+    """
+    Scans a string of text content for ownership evidence.
+    Args:
+        content: The text content to scan.
+        locator: The path or URL where the content was found.
+        source: The EvidenceSource to assign to new records.
+        is_python_file: Flag to enable Python-specific scans like `__author__`.
+    Returns:
+        A list of EvidenceRecord objects found in the text.
+    """
+    logger.info(f"Scanning {locator}")
+    evidence_list: List[EvidenceRecord] = []
+    now = datetime.datetime.now(datetime.timezone.utc)
+    found_in_scan = set()  # Avoid creating duplicate records from the same scan
+    # 1. Scan for copyright notices
+    for match in COPYRIGHT_RE.finditer(content):
+        copyright_text = match.group(1).strip().rstrip(",.")
+        entities = ner.extract_entities(copyright_text)
+        if entities:
+            for entity_name, entity_label in entities:
+                if entity_name.lower() not in JUNK_WORDS:
+                    key = ("copyright", entity_name)
+                    if key in found_in_scan:
+                        continue
+                    found_in_scan.add(key)
+                    value: dict[str, str | None] = {"holder": entity_name}
+                    record = EvidenceRecord(
+                        id=generate_evidence_id(
+                            source,
+                            EvidenceKind.COPYRIGHT,
+                            locator,
+                            str(value),
+                            entity_name,
+                        ),
+                        source=source,
+                        locator=locator,
+                        kind=EvidenceKind.COPYRIGHT,
+                        value=value,
+                        observed_at=now,
+                        confidence=0.40,
+                        notes=f"Found copyright holder '{entity_name}' via NER ({entity_label}) in '{locator}'.",
+                    )
+                    evidence_list.append(record)
+    # 2. Scan for __author__ tags in Python files
+    if is_python_file:
+        for match in AUTHOR_RE.finditer(content):
+            author_str = match.group(1).strip()
+            key = ("author", author_str)
+            if key in found_in_scan:
+                continue
+            found_in_scan.add(key)
+            parsed = _parse_contact_string(author_str)
+            if parsed.get("name") or parsed.get("email"):
+                value = {"name": parsed["name"], "email": parsed["email"]}
+                slug = parsed["name"] or parsed["email"] or "unknown"
+                record = EvidenceRecord(
+                    id=generate_evidence_id(
+                        source, EvidenceKind.AUTHOR_TAG, locator, str(value), slug
+                    ),
+                    source=source,
+                    locator=locator,
+                    kind=EvidenceKind.AUTHOR_TAG,
+                    value=value,
+                    observed_at=now,
+                    confidence=0.20,
+                    notes=f"Found __author__ tag for '{author_str}' in '{locator}'.",
+                )
+                evidence_list.append(record)
+    # 3. Scan for any standalone email address
+    for match in EMAIL_RE.finditer(content):
+        if valid_email := is_valid_email(match.group(0)):
+            if ("email", valid_email) in found_in_scan:
+                continue
+            found_in_scan.add(("email", valid_email))
+            value = {"name": None, "email": valid_email}
+            record = EvidenceRecord(
+                id=generate_evidence_id(
+                    source, EvidenceKind.CONTACT, locator, str(value), valid_email
+                ),
+                source=source,
+                locator=locator,
+                kind=EvidenceKind.CONTACT,
+                value=value,
+                observed_at=now,
+                confidence=0.15,
+                notes=f"Found validated contact email '{valid_email}' in '{locator}'.",
+            )
+            evidence_list.append(record)
+    # 4. Scan for any URLs
+    for match in URL_RE.finditer(content):
+        url = match.group(0)
+        if ("url", url) in found_in_scan:
+            continue
+        found_in_scan.add(("url", url))
+        value = {"label": "URL found in content", "url": url}
+        record = EvidenceRecord(
+            id=generate_evidence_id(
+                source,
+                EvidenceKind.PROJECT_URL,
+                locator,
+                str(value),
+                url,
+                hint="content-scan",
+            ),
+            source=source,
+            locator=locator,
+            kind=EvidenceKind.PROJECT_URL,
+            value=value,
+            observed_at=now,
+            confidence=0.10,
+            notes=f"Found URL '{url}' in '{locator}'.",
+        )
+        evidence_list.append(record)
+    return evidence_list

skip_trace/analysis/evidence.py CHANGED Viewed

@@ -94,7 +94,7 @@ def _parse_contact_string(contact_str: str) -> Dict[str, Optional[str]]:
     if not contact_str or not contact_str.strip():
         return {"name": None, "email": None}
-    # Pattern for "Name <email@domain.com>"
+    # Pattern for "Name <user@example.com>"
     match = re.search(r"(.+)<(.+)>", contact_str)
     if match:
         name = match.group(1).strip()

skip_trace/analysis/scoring.py CHANGED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import collections
 import logging
 from typing import Dict, List, Optional, Tuple
+from urllib.parse import urlparse
 import tldextract
@@ -97,6 +98,18 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
             kind = OwnerKind.INDIVIDUAL
         else:
             kind = OwnerKind.PROJECT
+    # NEW: Handle PyPI Publisher Attestation
+    elif record.kind == EvidenceKind.PYPI_PUBLISHER_ATTESTATION:
+        repo_slug = record.value.get("repository")
+        if repo_slug and "/" in repo_slug:
+            name = repo_slug.split("/")[0]  # The user or org
+            kind = OwnerKind.PROJECT
+    # --- Handle EMAIL evidence directly ---
+    elif record.kind == EvidenceKind.EMAIL:
+        name = record.value.get("email")
+        kind = OwnerKind.INDIVIDUAL
     # Handle user profile and company evidence
     elif record.kind == EvidenceKind.USER_PROFILE:
         name = record.value.get("user_name")
@@ -123,7 +136,7 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
         if not raw_holder:
             return None, kind
-        # --- NEW: Sanitize the raw string before accepting it as a name ---
+        # --- Sanitize the raw string before accepting it as a name ---
         # 1. Reject if it's too long to be a name.
         if len(raw_holder) > 50:
             return None, kind
@@ -137,6 +150,38 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
             kind = OwnerKind.INDIVIDUAL
         else:
             kind = OwnerKind.COMPANY
+    elif record.kind == EvidenceKind.SIGSTORE_SIGNER_IDENTITY:
+        identity = record.value.get("identity", "")
+        if "@" in identity and "." in identity:  # Looks like an email
+            name = identity
+            kind = OwnerKind.INDIVIDUAL
+        else:
+            try:
+                # Try to parse a build identity URL
+                parsed = urlparse(identity)
+                if parsed.hostname and "github.com" in parsed.hostname:
+                    path_parts = [p for p in parsed.path.split("/") if p]
+                    if len(path_parts) >= 1:
+                        name = path_parts[0]  # The user or org
+                        kind = OwnerKind.PROJECT
+                else:
+                    name = identity
+                    kind = OwnerKind.PROJECT
+            except Exception:
+                name = identity
+                kind = OwnerKind.PROJECT
+    elif record.kind == EvidenceKind.SIGSTORE_BUILD_PROVENANCE:
+        repo_uri = record.value.get("repo_uri", "")
+        try:
+            # Parse git+https://github.com/org/repo.git
+            parsed = urlparse(repo_uri.split("@")[0].replace("git+", ""))
+            if parsed.hostname and "github.com" in parsed.hostname:
+                path_parts = [p for p in parsed.path.split("/") if p]
+                if len(path_parts) >= 1:
+                    name = path_parts[0]  # The user or org
+                    kind = OwnerKind.PROJECT
+        except Exception:
+            name = None
     return name, kind

skip_trace/analysis/source_scanner.py CHANGED Viewed

@@ -64,7 +64,7 @@ JUNK_WORDS = {
     "BSD",
 }
-# --- NEW: Filename allowlist and more robust binary detection ---
+# --- Filename allowlist and more robust binary detection ---
 # A set of common extensionless text files that should never be treated as binary.
 TEXT_FILENAMES = {

skip_trace/cli.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import List, Optional
 from rich_argparse import RichHelpFormatter
-from . import __version__
+from .__about__ import __version__
 from .main import run_command
 from .utils.cli_suggestions import SmartParser

skip_trace/collectors/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
 # skip_trace/collectors/__init__.py
-from . import github, package_files, pypi, whois
+from . import github, github_files, package_files, pypi, sigstore, whois
-__all__ = ["github", "pypi", "whois", "package_files"]
+__all__ = ["github", "github_files", "package_files", "pypi", "whois", "sigstore"]

skip-trace 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

skip-trace 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl