skip-trace 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skip_trace/__about__.py CHANGED
@@ -5,15 +5,25 @@ __all__ = [
5
5
  "__version__",
6
6
  "__description__",
7
7
  "__readme__",
8
+ "__license__",
8
9
  "__credits__",
9
10
  "__requires_python__",
10
11
  "__status__",
12
+ "__keywords__",
11
13
  ]
12
14
 
13
15
  __title__ = "skip-trace"
14
- __version__ = "0.1.0"
16
+ __version__ = "0.1.1"
15
17
  __description__ = "Ownership Attribution for Python Packages"
16
18
  __readme__ = "README.md"
19
+ __license__ = "MIT"
17
20
  __credits__ = [{"name": "Matthew Dean Martin", "email": "matthewdeanmartin@gmail.com"}]
18
- __requires_python__ = ">=3.8"
19
- __status__ = "1 - Planning"
21
+ __requires_python__ = ">=3.13"
22
+ __status__ = "3 - Alpha"
23
+ __keywords__ = [
24
+ "PyPI maintainers",
25
+ "package owners",
26
+ "package provenance",
27
+ "software supply chain",
28
+ "PEP 541",
29
+ ]
skip_trace/__init__.py CHANGED
@@ -1,6 +1,4 @@
1
1
  # skip_trace/__init__.py
2
2
 
3
- __version__ = "0.1.0"
4
-
5
3
  # __all__ will be populated as public functions/classes are added.
6
4
  __all__ = []
@@ -0,0 +1,189 @@
1
+ # skip_trace/analysis/content_scanner.py
2
+ from __future__ import annotations
3
+
4
+ import datetime
5
+ import logging
6
+ import re
7
+ from typing import List
8
+
9
+ from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
10
+ from ..utils.validation import is_valid_email
11
+ from . import ner
12
+ from .evidence import _parse_contact_string, generate_evidence_id
13
+
14
# Regex to find copyright notices, capturing the holder.
# Skips an optional "(c)" marker and an optional run of years/commas/dashes,
# then captures the rest of the line as the holder text.
COPYRIGHT_RE = re.compile(
    r"copyright\s*(?:\(c\))?\s*(?:[0-9,\-\s]+)?\s*([^\n]+)", re.IGNORECASE
)

# Regex to find __author__ assignments (single- or double-quoted string value).
AUTHOR_RE = re.compile(r"__author__\s*=\s*['\"]([^'\"]+)['\"]")

# Regex for finding standalone email addresses - used as a fast pre-filter.
# The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal
# "|" because "|" has no alternation meaning inside a character class.
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

# --- Regex for finding URLs in text content ---
# Matches "http://", "https://" or bare "www." hosts with at least one dot,
# followed by an optional path/query/fragment that runs to the next whitespace.
URL_RE = re.compile(
    r"""\b(?:https?://|www\.)[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+(?:[/?#]\S*)?"""
)

# Words that indicate a regex grabbed junk from a license instead of a name.
# Most entries are lowercase, but the license names "MIT" and "BSD" are stored
# uppercase, so lookups against this set need to be case-insensitive.
JUNK_WORDS = {
    "copyright",
    "holders",
    "license",
    "document",
    "accompanies",
    "notice",
    "authors",
    "identifies",
    "endorse",
    "promote",
    "software",
    "permission",
    "conditions",
    "and",
    "other",
    "the",
    "for",
    "with",
    "this",
    "list",
    "following",
    "txt",
    "damages",
    "owner",
    "incidental",
    "holder",
    "liability",
    "MIT",
    "BSD",
}

logger = logging.getLogger(__name__)
64
+
65
+
66
def scan_text(
    content: str, locator: str, source: EvidenceSource, is_python_file: bool = False
) -> List[EvidenceRecord]:
    """
    Scans a string of text content for ownership evidence.

    Four passes run in order: copyright notices (holder names extracted via
    NER), ``__author__`` assignments (only when ``is_python_file`` is set),
    standalone email addresses, and URLs. Within one call, each distinct
    finding of a given pass produces at most one record.

    Args:
        content: The text content to scan.
        locator: The path or URL where the content was found.
        source: The EvidenceSource to assign to new records.
        is_python_file: Flag to enable Python-specific scans like `__author__`.

    Returns:
        A list of EvidenceRecord objects found in the text, in discovery order.
    """
    logger.info("Scanning %s", locator)
    evidence_list: List[EvidenceRecord] = []
    now = datetime.datetime.now(datetime.timezone.utc)
    found_in_scan = set()  # Avoid creating duplicate records from the same scan
    value: dict[str, str | None]

    # JUNK_WORDS mixes lowercase words with uppercase license names ("MIT",
    # "BSD"). Candidates are lowercased before the lookup, so lowercase the
    # set as well; otherwise the uppercase entries can never match.
    junk_words = {word.lower() for word in JUNK_WORDS}

    # 1. Scan for copyright notices
    for match in COPYRIGHT_RE.finditer(content):
        copyright_text = match.group(1).strip().rstrip(",.")
        # A falsy NER result (None or empty) simply yields no candidates.
        for entity_name, entity_label in ner.extract_entities(copyright_text) or ():
            if entity_name.lower() in junk_words:
                continue
            key = ("copyright", entity_name)
            if key in found_in_scan:
                continue
            found_in_scan.add(key)
            value = {"holder": entity_name}
            evidence_list.append(
                EvidenceRecord(
                    id=generate_evidence_id(
                        source,
                        EvidenceKind.COPYRIGHT,
                        locator,
                        str(value),
                        entity_name,
                    ),
                    source=source,
                    locator=locator,
                    kind=EvidenceKind.COPYRIGHT,
                    value=value,
                    observed_at=now,
                    confidence=0.40,
                    notes=f"Found copyright holder '{entity_name}' via NER ({entity_label}) in '{locator}'.",
                )
            )

    # 2. Scan for __author__ tags in Python files
    if is_python_file:
        for match in AUTHOR_RE.finditer(content):
            author_str = match.group(1).strip()
            key = ("author", author_str)
            if key in found_in_scan:
                continue
            # Recorded as seen before parsing, so an unparseable author string
            # is not re-parsed if it appears again in the same content.
            found_in_scan.add(key)
            parsed = _parse_contact_string(author_str)
            if not (parsed.get("name") or parsed.get("email")):
                continue
            value = {"name": parsed["name"], "email": parsed["email"]}
            slug = parsed["name"] or parsed["email"] or "unknown"
            evidence_list.append(
                EvidenceRecord(
                    id=generate_evidence_id(
                        source, EvidenceKind.AUTHOR_TAG, locator, str(value), slug
                    ),
                    source=source,
                    locator=locator,
                    kind=EvidenceKind.AUTHOR_TAG,
                    value=value,
                    observed_at=now,
                    confidence=0.20,
                    notes=f"Found __author__ tag for '{author_str}' in '{locator}'.",
                )
            )

    # 3. Scan for any standalone email address
    for match in EMAIL_RE.finditer(content):
        # EMAIL_RE is only a pre-filter; is_valid_email has the final say and
        # its (truthy) return value is what gets recorded.
        valid_email = is_valid_email(match.group(0))
        if not valid_email:
            continue
        key = ("email", valid_email)
        if key in found_in_scan:
            continue
        found_in_scan.add(key)
        value = {"name": None, "email": valid_email}
        evidence_list.append(
            EvidenceRecord(
                id=generate_evidence_id(
                    source, EvidenceKind.CONTACT, locator, str(value), valid_email
                ),
                source=source,
                locator=locator,
                kind=EvidenceKind.CONTACT,
                value=value,
                observed_at=now,
                confidence=0.15,
                notes=f"Found validated contact email '{valid_email}' in '{locator}'.",
            )
        )

    # 4. Scan for any URLs
    # URL_RE's path part runs to the next whitespace, so closing punctuation
    # from prose or markup ("(https://x.org/a).", "<https://x.org/a>") ends up
    # inside the match. Trim it so the recorded URL is usable. A URL that
    # legitimately ends in one of these characters loses it; that is the
    # accepted trade-off.
    trailing_punctuation = ".,;:!?)>]}'\""
    for match in URL_RE.finditer(content):
        url = match.group(0).rstrip(trailing_punctuation)
        key = ("url", url)
        if key in found_in_scan:
            continue
        found_in_scan.add(key)
        value = {"label": "URL found in content", "url": url}
        evidence_list.append(
            EvidenceRecord(
                id=generate_evidence_id(
                    source,
                    EvidenceKind.PROJECT_URL,
                    locator,
                    str(value),
                    url,
                    hint="content-scan",
                ),
                source=source,
                locator=locator,
                kind=EvidenceKind.PROJECT_URL,
                value=value,
                observed_at=now,
                confidence=0.10,
                notes=f"Found URL '{url}' in '{locator}'.",
            )
        )

    return evidence_list
@@ -94,7 +94,7 @@ def _parse_contact_string(contact_str: str) -> Dict[str, Optional[str]]:
94
94
  if not contact_str or not contact_str.strip():
95
95
  return {"name": None, "email": None}
96
96
 
97
- # Pattern for "Name <email@domain.com>"
97
+ # Pattern for "Name <user@example.com>"
98
98
  match = re.search(r"(.+)<(.+)>", contact_str)
99
99
  if match:
100
100
  name = match.group(1).strip()
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
  import collections
5
5
  import logging
6
6
  from typing import Dict, List, Optional, Tuple
7
+ from urllib.parse import urlparse
7
8
 
8
9
  import tldextract
9
10
 
@@ -97,6 +98,18 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
97
98
  kind = OwnerKind.INDIVIDUAL
98
99
  else:
99
100
  kind = OwnerKind.PROJECT
101
+
102
+ # Handle PyPI Publisher Attestation
103
+ elif record.kind == EvidenceKind.PYPI_PUBLISHER_ATTESTATION:
104
+ repo_slug = record.value.get("repository")
105
+ if repo_slug and "/" in repo_slug:
106
+ name = repo_slug.split("/")[0] # The user or org
107
+ kind = OwnerKind.PROJECT
108
+
109
+ # --- Handle EMAIL evidence directly ---
110
+ elif record.kind == EvidenceKind.EMAIL:
111
+ name = record.value.get("email")
112
+ kind = OwnerKind.INDIVIDUAL
100
113
  # Handle user profile and company evidence
101
114
  elif record.kind == EvidenceKind.USER_PROFILE:
102
115
  name = record.value.get("user_name")
@@ -123,7 +136,7 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
123
136
  if not raw_holder:
124
137
  return None, kind
125
138
 
126
- # --- NEW: Sanitize the raw string before accepting it as a name ---
139
+ # --- Sanitize the raw string before accepting it as a name ---
127
140
  # 1. Reject if it's too long to be a name.
128
141
  if len(raw_holder) > 50:
129
142
  return None, kind
@@ -137,6 +150,38 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
137
150
  kind = OwnerKind.INDIVIDUAL
138
151
  else:
139
152
  kind = OwnerKind.COMPANY
153
+ elif record.kind == EvidenceKind.SIGSTORE_SIGNER_IDENTITY:
154
+ identity = record.value.get("identity", "")
155
+ if "@" in identity and "." in identity: # Looks like an email
156
+ name = identity
157
+ kind = OwnerKind.INDIVIDUAL
158
+ else:
159
+ try:
160
+ # Try to parse a build identity URL
161
+ parsed = urlparse(identity)
162
+ if parsed.hostname and "github.com" in parsed.hostname:
163
+ path_parts = [p for p in parsed.path.split("/") if p]
164
+ if len(path_parts) >= 1:
165
+ name = path_parts[0] # The user or org
166
+ kind = OwnerKind.PROJECT
167
+ else:
168
+ name = identity
169
+ kind = OwnerKind.PROJECT
170
+ except Exception:
171
+ name = identity
172
+ kind = OwnerKind.PROJECT
173
+ elif record.kind == EvidenceKind.SIGSTORE_BUILD_PROVENANCE:
174
+ repo_uri = record.value.get("repo_uri", "")
175
+ try:
176
+ # Parse git+https://github.com/org/repo.git
177
+ parsed = urlparse(repo_uri.split("@")[0].replace("git+", ""))
178
+ if parsed.hostname and "github.com" in parsed.hostname:
179
+ path_parts = [p for p in parsed.path.split("/") if p]
180
+ if len(path_parts) >= 1:
181
+ name = path_parts[0] # The user or org
182
+ kind = OwnerKind.PROJECT
183
+ except Exception:
184
+ name = None
140
185
 
141
186
  return name, kind
142
187
 
@@ -64,7 +64,7 @@ JUNK_WORDS = {
64
64
  "BSD",
65
65
  }
66
66
 
67
- # --- NEW: Filename allowlist and more robust binary detection ---
67
+ # --- Filename allowlist and more robust binary detection ---
68
68
 
69
69
  # A set of common extensionless text files that should never be treated as binary.
70
70
  TEXT_FILENAMES = {
skip_trace/cli.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Optional
6
6
 
7
7
  from rich_argparse import RichHelpFormatter
8
8
 
9
- from . import __version__
9
+ from .__about__ import __version__
10
10
  from .main import run_command
11
11
  from .utils.cli_suggestions import SmartParser
12
12
 
@@ -1,4 +1,4 @@
1
1
  # skip_trace/collectors/__init__.py
2
- from . import github, package_files, pypi, whois
2
+ from . import github, github_files, package_files, pypi, sigstore, whois
3
3
 
4
- __all__ = ["github", "pypi", "whois", "package_files"]
4
+ __all__ = ["github", "github_files", "package_files", "pypi", "whois", "sigstore"]