PyPI - skip-trace - Versions diffs - 0.1.0__tar.gz → 0.1.1__tar.gz - Mend

skip-trace 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{skip_trace-0.1.0 → skip_trace-0.1.1}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,15 @@
 Metadata-Version: 2.4
 Name: skip-trace
-Version: 0.1.0
+Version: 0.1.1
 Summary: Ownership Attribution for Python Packages
 Project-URL: Homepage, https://github.com/matthewdeanmartin/skip-trace
 Project-URL: Issues, https://github.com/matthewdeanmartin/skip-trace/issues
 Author-email: Matthew Dean Martin <matthewdeanmartin@gmail.com>
+License-Expression: MIT
 License-File: LICENSE
-Classifier: Development Status :: 1 - Planning
+Keywords: PEP 541,PyPI maintainers,package owners,package provenance,software supply chain
+Classifier: Development Status :: 3 - Alpha
+Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
@@ -16,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Security
 Classifier: Topic :: Software Development :: Quality Assurance
-Requires-Python: >=3.8
+Requires-Python: >=3.13
 Requires-Dist: beautifulsoup4>=4.12.0
 Requires-Dist: email-validator>=2.0.0
 Requires-Dist: en-core-web-sm
@@ -27,6 +30,7 @@ Requires-Dist: pygithub>=1.59.0
 Requires-Dist: python-dotenv
 Requires-Dist: python-dotenv>=1.0.0
 Requires-Dist: python-whois>=0.8.0
+Requires-Dist: pyyaml>=6.0
 Requires-Dist: rich-argparse
 Requires-Dist: rich>=13.0.0
 Requires-Dist: sigstore>=1.0.0

{skip_trace-0.1.0 → skip_trace-0.1.1}/pyproject.toml RENAMED Viewed

@@ -3,13 +3,14 @@
 [project]
 name = "skip-trace"
-version = "0.1.0"
+version = "0.1.1"
 description = "Ownership Attribution for Python Packages"
 readme = "README.md"
+license = "MIT"
 authors = [
     { name = "Matthew Dean Martin", email = "matthewdeanmartin@gmail.com" },
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.13"
 classifiers = [
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.9",
@@ -17,10 +18,11 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Topic :: Security",
     "Topic :: Software Development :: Quality Assurance",
-    "Development Status :: 1 - Planning"
+    "Development Status :: 3 - Alpha"
 ]
 dependencies = [
     "httpx[http2]>=0.25.0",
@@ -33,7 +35,7 @@ dependencies = [
     "beautifulsoup4>=4.12.0", # Added for HTML scraping
     "PyGithub>=1.59.0", # NEW: For GitHub API interaction
     "openai>=1.3.0",
-    "sigstore>=1.0.0",
+    "sigstore>=1.0.0", # not used yet, may need to remove
     # "socials", is for regexing
     # custom domains
     "python-whois>=0.8.0",
@@ -44,6 +46,11 @@ dependencies = [
     # "en_core_web_sm"
     "rich-argparse",
     "en-core-web-sm",
+    # "pypi_attestations"
+    "PyYAML>=6.0"
+]
+keywords = [
+  "PyPI maintainers", "package owners", "package provenance", "software supply chain", "PEP 541"
 ]
@@ -66,6 +73,7 @@ dev = [
     "mypy; python_version >= '3.8'",
     "types-toml; python_version >= '3.8'",
     "types-jsonschema; python_version >= '3.8'",
+    "types-PyYAML",
     # reports
     # build
@@ -150,3 +158,8 @@ entity_resolution_llm = false # As requested, disabled by default
 "llm_ner_claim" = 0.20 # Max weight for an LLM-only claim
 "conflict" = -0.15
+[tool.jiggle_version]
+scheme = "pep440"
+default_increment = "patch"
+ignore = ["test", "sample_projects", "dead_code", ".packages"]

{skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/__about__.py RENAMED Viewed

@@ -5,15 +5,25 @@ __all__ = [
     "__version__",
     "__description__",
     "__readme__",
+    "__license__",
     "__credits__",
     "__requires_python__",
     "__status__",
+    "__keywords__",
 ]
 __title__ = "skip-trace"
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 __description__ = "Ownership Attribution for Python Packages"
 __readme__ = "README.md"
+__license__ = "MIT"
 __credits__ = [{"name": "Matthew Dean Martin", "email": "matthewdeanmartin@gmail.com"}]
-__requires_python__ = ">=3.8"
-__status__ = "1 - Planning"
+__requires_python__ = ">=3.13"
+__status__ = "3 - Alpha"
+__keywords__ = [
+    "PyPI maintainers",
+    "package owners",
+    "package provenance",
+    "software supply chain",
+    "PEP 541",
+]

{skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/__init__.py RENAMED Viewed

@@ -1,6 +1,4 @@
 # skip_trace/__init__.py
-__version__ = "0.1.0"
 # __all__ will be populated as public functions/classes are added.
 __all__ = []

skip_trace-0.1.1/skip_trace/analysis/content_scanner.py ADDED Viewed

@@ -0,0 +1,189 @@
+# skip_trace/analysis/content_scanner.py
+from __future__ import annotations
+import datetime
+import logging
+import re
+from typing import List
+from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
+from ..utils.validation import is_valid_email
+from . import ner
+from .evidence import _parse_contact_string, generate_evidence_id
+# Regex to find copyright notices, capturing the holder.
+COPYRIGHT_RE = re.compile(
+    r"copyright\s*(?:\(c\))?\s*(?:[0-9,\-\s]+)?\s*([^\n]+)", re.IGNORECASE
+)
+# Regex to find __author__ assignments
+AUTHOR_RE = re.compile(r"__author__\s*=\s*['\"]([^'\"]+)['\"]")
+# Regex for finding standalone email addresses - used as a fast pre-filter
+EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b")
+# --- Regex for finding URLs in text content ---
+URL_RE = re.compile(
+    r"""\b(?:https?://|www\.)[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+(?:[/?#]\S*)?"""
+)
+# Words that indicate a regex grabbed junk from a license instead of a name.
+JUNK_WORDS = {
+    "copyright",
+    "holders",
+    "license",
+    "document",
+    "accompanies",
+    "notice",
+    "authors",
+    "identifies",
+    "endorse",
+    "promote",
+    "software",
+    "permission",
+    "conditions",
+    "and",
+    "other",
+    "the",
+    "for",
+    "with",
+    "this",
+    "list",
+    "following",
+    "txt",
+    "damages",
+    "owner",
+    "incidental",
+    "holder",
+    "liability",
+    "MIT",
+    "BSD",
+}
+logger = logging.getLogger(__name__)
+def scan_text(
+    content: str, locator: str, source: EvidenceSource, is_python_file: bool = False
+) -> List[EvidenceRecord]:
+    """
+    Scans a string of text content for ownership evidence.
+    Args:
+        content: The text content to scan.
+        locator: The path or URL where the content was found.
+        source: The EvidenceSource to assign to new records.
+        is_python_file: Flag to enable Python-specific scans like `__author__`.
+    Returns:
+        A list of EvidenceRecord objects found in the text.
+    """
+    logger.info(f"Scanning {locator}")
+    evidence_list: List[EvidenceRecord] = []
+    now = datetime.datetime.now(datetime.timezone.utc)
+    found_in_scan = set()  # Avoid creating duplicate records from the same scan
+    # 1. Scan for copyright notices
+    for match in COPYRIGHT_RE.finditer(content):
+        copyright_text = match.group(1).strip().rstrip(",.")
+        entities = ner.extract_entities(copyright_text)
+        if entities:
+            for entity_name, entity_label in entities:
+                if entity_name.lower() not in JUNK_WORDS:
+                    key = ("copyright", entity_name)
+                    if key in found_in_scan:
+                        continue
+                    found_in_scan.add(key)
+                    value: dict[str, str | None] = {"holder": entity_name}
+                    record = EvidenceRecord(
+                        id=generate_evidence_id(
+                            source,
+                            EvidenceKind.COPYRIGHT,
+                            locator,
+                            str(value),
+                            entity_name,
+                        ),
+                        source=source,
+                        locator=locator,
+                        kind=EvidenceKind.COPYRIGHT,
+                        value=value,
+                        observed_at=now,
+                        confidence=0.40,
+                        notes=f"Found copyright holder '{entity_name}' via NER ({entity_label}) in '{locator}'.",
+                    )
+                    evidence_list.append(record)
+    # 2. Scan for __author__ tags in Python files
+    if is_python_file:
+        for match in AUTHOR_RE.finditer(content):
+            author_str = match.group(1).strip()
+            key = ("author", author_str)
+            if key in found_in_scan:
+                continue
+            found_in_scan.add(key)
+            parsed = _parse_contact_string(author_str)
+            if parsed.get("name") or parsed.get("email"):
+                value = {"name": parsed["name"], "email": parsed["email"]}
+                slug = parsed["name"] or parsed["email"] or "unknown"
+                record = EvidenceRecord(
+                    id=generate_evidence_id(
+                        source, EvidenceKind.AUTHOR_TAG, locator, str(value), slug
+                    ),
+                    source=source,
+                    locator=locator,
+                    kind=EvidenceKind.AUTHOR_TAG,
+                    value=value,
+                    observed_at=now,
+                    confidence=0.20,
+                    notes=f"Found __author__ tag for '{author_str}' in '{locator}'.",
+                )
+                evidence_list.append(record)
+    # 3. Scan for any standalone email address
+    for match in EMAIL_RE.finditer(content):
+        if valid_email := is_valid_email(match.group(0)):
+            if ("email", valid_email) in found_in_scan:
+                continue
+            found_in_scan.add(("email", valid_email))
+            value = {"name": None, "email": valid_email}
+            record = EvidenceRecord(
+                id=generate_evidence_id(
+                    source, EvidenceKind.CONTACT, locator, str(value), valid_email
+                ),
+                source=source,
+                locator=locator,
+                kind=EvidenceKind.CONTACT,
+                value=value,
+                observed_at=now,
+                confidence=0.15,
+                notes=f"Found validated contact email '{valid_email}' in '{locator}'.",
+            )
+            evidence_list.append(record)
+    # 4. Scan for any URLs
+    for match in URL_RE.finditer(content):
+        url = match.group(0)
+        if ("url", url) in found_in_scan:
+            continue
+        found_in_scan.add(("url", url))
+        value = {"label": "URL found in content", "url": url}
+        record = EvidenceRecord(
+            id=generate_evidence_id(
+                source,
+                EvidenceKind.PROJECT_URL,
+                locator,
+                str(value),
+                url,
+                hint="content-scan",
+            ),
+            source=source,
+            locator=locator,
+            kind=EvidenceKind.PROJECT_URL,
+            value=value,
+            observed_at=now,
+            confidence=0.10,
+            notes=f"Found URL '{url}' in '{locator}'.",
+        )
+        evidence_list.append(record)
+    return evidence_list

{skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/evidence.py RENAMED Viewed

@@ -94,7 +94,7 @@ def _parse_contact_string(contact_str: str) -> Dict[str, Optional[str]]:
     if not contact_str or not contact_str.strip():
         return {"name": None, "email": None}
-    # Pattern for "Name <email@domain.com>"
+    # Pattern for "Name <user@example.com>"
     match = re.search(r"(.+)<(.+)>", contact_str)
     if match:
         name = match.group(1).strip()

{skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/scoring.py RENAMED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import collections
 import logging
 from typing import Dict, List, Optional, Tuple
+from urllib.parse import urlparse
 import tldextract
@@ -97,6 +98,18 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
             kind = OwnerKind.INDIVIDUAL
         else:
             kind = OwnerKind.PROJECT
+    # NEW: Handle PyPI Publisher Attestation
+    elif record.kind == EvidenceKind.PYPI_PUBLISHER_ATTESTATION:
+        repo_slug = record.value.get("repository")
+        if repo_slug and "/" in repo_slug:
+            name = repo_slug.split("/")[0]  # The user or org
+            kind = OwnerKind.PROJECT
+    # --- Handle EMAIL evidence directly ---
+    elif record.kind == EvidenceKind.EMAIL:
+        name = record.value.get("email")
+        kind = OwnerKind.INDIVIDUAL
     # Handle user profile and company evidence
     elif record.kind == EvidenceKind.USER_PROFILE:
         name = record.value.get("user_name")
@@ -123,7 +136,7 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
         if not raw_holder:
             return None, kind
-        # --- NEW: Sanitize the raw string before accepting it as a name ---
+        # --- Sanitize the raw string before accepting it as a name ---
         # 1. Reject if it's too long to be a name.
         if len(raw_holder) > 50:
             return None, kind
@@ -137,6 +150,38 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
             kind = OwnerKind.INDIVIDUAL
         else:
             kind = OwnerKind.COMPANY
+    elif record.kind == EvidenceKind.SIGSTORE_SIGNER_IDENTITY:
+        identity = record.value.get("identity", "")
+        if "@" in identity and "." in identity:  # Looks like an email
+            name = identity
+            kind = OwnerKind.INDIVIDUAL
+        else:
+            try:
+                # Try to parse a build identity URL
+                parsed = urlparse(identity)
+                if parsed.hostname and "github.com" in parsed.hostname:
+                    path_parts = [p for p in parsed.path.split("/") if p]
+                    if len(path_parts) >= 1:
+                        name = path_parts[0]  # The user or org
+                        kind = OwnerKind.PROJECT
+                else:
+                    name = identity
+                    kind = OwnerKind.PROJECT
+            except Exception:
+                name = identity
+                kind = OwnerKind.PROJECT
+    elif record.kind == EvidenceKind.SIGSTORE_BUILD_PROVENANCE:
+        repo_uri = record.value.get("repo_uri", "")
+        try:
+            # Parse git+https://github.com/org/repo.git
+            parsed = urlparse(repo_uri.split("@")[0].replace("git+", ""))
+            if parsed.hostname and "github.com" in parsed.hostname:
+                path_parts = [p for p in parsed.path.split("/") if p]
+                if len(path_parts) >= 1:
+                    name = path_parts[0]  # The user or org
+                    kind = OwnerKind.PROJECT
+        except Exception:
+            name = None
     return name, kind

{skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/source_scanner.py RENAMED Viewed

@@ -64,7 +64,7 @@ JUNK_WORDS = {
     "BSD",
 }
-# --- NEW: Filename allowlist and more robust binary detection ---
+# --- Filename allowlist and more robust binary detection ---
 # A set of common extensionless text files that should never be treated as binary.
 TEXT_FILENAMES = {

{skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/cli.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import List, Optional
 from rich_argparse import RichHelpFormatter
-from . import __version__
+from .__about__ import __version__
 from .main import run_command
 from .utils.cli_suggestions import SmartParser

skip_trace-0.1.1/skip_trace/collectors/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# skip_trace/collectors/__init__.py
+from . import github, github_files, package_files, pypi, sigstore, whois
+__all__ = ["github", "github_files", "package_files", "pypi", "whois", "sigstore"]

skip-trace 0.1.0__tar.gz → 0.1.1__tar.gz

skip-trace 0.1.0__tar.gz → 0.1.1__tar.gz