skip-trace 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {skip_trace-0.1.0 → skip_trace-0.1.1}/PKG-INFO +7 -3
- {skip_trace-0.1.0 → skip_trace-0.1.1}/pyproject.toml +17 -4
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/__about__.py +13 -3
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/__init__.py +0 -2
- skip_trace-0.1.1/skip_trace/analysis/content_scanner.py +189 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/evidence.py +1 -1
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/scoring.py +46 -1
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/source_scanner.py +1 -1
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/cli.py +1 -1
- skip_trace-0.1.1/skip_trace/collectors/__init__.py +4 -0
- skip_trace-0.1.1/skip_trace/collectors/github_files.py +359 -0
- skip_trace-0.1.1/skip_trace/collectors/package_files.py +341 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/collectors/pypi.py +1 -1
- skip_trace-0.1.1/skip_trace/collectors/pypi_attestations.py +160 -0
- skip_trace-0.1.1/skip_trace/collectors/sigstore.py +160 -0
- skip_trace-0.1.1/skip_trace/collectors/urls.py +96 -0
- skip_trace-0.1.1/skip_trace/m.py +287 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/main.py +103 -85
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/reporting/md_reporter.py +68 -4
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/schemas.py +21 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/utils/http_client.py +18 -0
- skip_trace-0.1.0/skip_trace/collectors/__init__.py +0 -4
- skip_trace-0.1.0/skip_trace/collectors/package_files.py +0 -150
- {skip_trace-0.1.0 → skip_trace-0.1.1}/.gitignore +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/LICENSE +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/README.md +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/__main__.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/__init__.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/analysis/ner.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/collectors/github.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/collectors/whois.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/config.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/exceptions.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/py.typed.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/reporting/__init__.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/reporting/json_reporter.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/utils/__init__.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/utils/cache.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/utils/cli_suggestions.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/utils/safe_targz.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/skip_trace/utils/validation.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/test/__init__.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/test/conftest.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/test/test_collectors/__init__.py +0 -0
- {skip_trace-0.1.0 → skip_trace-0.1.1}/test/test_collectors/test_pypi.py +0 -0
@@ -1,12 +1,15 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: skip-trace
|
3
|
-
Version: 0.1.0
|
3
|
+
Version: 0.1.1
|
4
4
|
Summary: Ownership Attribution for Python Packages
|
5
5
|
Project-URL: Homepage, https://github.com/matthewdeanmartin/skip-trace
|
6
6
|
Project-URL: Issues, https://github.com/matthewdeanmartin/skip-trace/issues
|
7
7
|
Author-email: Matthew Dean Martin <matthewdeanmartin@gmail.com>
|
8
|
+
License-Expression: MIT
|
8
9
|
License-File: LICENSE
|
9
|
-
|
10
|
+
Keywords: PEP 541,PyPI maintainers,package owners,package provenance,software supply chain
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
13
|
Classifier: Operating System :: OS Independent
|
11
14
|
Classifier: Programming Language :: Python :: 3
|
12
15
|
Classifier: Programming Language :: Python :: 3.9
|
@@ -16,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
16
19
|
Classifier: Programming Language :: Python :: 3.13
|
17
20
|
Classifier: Topic :: Security
|
18
21
|
Classifier: Topic :: Software Development :: Quality Assurance
|
19
|
-
Requires-Python: >=3.
|
22
|
+
Requires-Python: >=3.13
|
20
23
|
Requires-Dist: beautifulsoup4>=4.12.0
|
21
24
|
Requires-Dist: email-validator>=2.0.0
|
22
25
|
Requires-Dist: en-core-web-sm
|
@@ -27,6 +30,7 @@ Requires-Dist: pygithub>=1.59.0
|
|
27
30
|
Requires-Dist: python-dotenv
|
28
31
|
Requires-Dist: python-dotenv>=1.0.0
|
29
32
|
Requires-Dist: python-whois>=0.8.0
|
33
|
+
Requires-Dist: pyyaml>=6.0
|
30
34
|
Requires-Dist: rich-argparse
|
31
35
|
Requires-Dist: rich>=13.0.0
|
32
36
|
Requires-Dist: sigstore>=1.0.0
|
@@ -3,13 +3,14 @@
|
|
3
3
|
|
4
4
|
[project]
|
5
5
|
name = "skip-trace"
|
6
|
-
version = "0.1.0"
|
6
|
+
version = "0.1.1"
|
7
7
|
description = "Ownership Attribution for Python Packages"
|
8
8
|
readme = "README.md"
|
9
|
+
license = "MIT"
|
9
10
|
authors = [
|
10
11
|
{ name = "Matthew Dean Martin", email = "matthewdeanmartin@gmail.com" },
|
11
12
|
]
|
12
|
-
requires-python = ">=3.
|
13
|
+
requires-python = ">=3.13"
|
13
14
|
classifiers = [
|
14
15
|
"Programming Language :: Python :: 3",
|
15
16
|
"Programming Language :: Python :: 3.9",
|
@@ -17,10 +18,11 @@ classifiers = [
|
|
17
18
|
"Programming Language :: Python :: 3.11",
|
18
19
|
"Programming Language :: Python :: 3.12",
|
19
20
|
"Programming Language :: Python :: 3.13",
|
21
|
+
"License :: OSI Approved :: MIT License",
|
20
22
|
"Operating System :: OS Independent",
|
21
23
|
"Topic :: Security",
|
22
24
|
"Topic :: Software Development :: Quality Assurance",
|
23
|
-
"Development Status ::
|
25
|
+
"Development Status :: 3 - Alpha"
|
24
26
|
]
|
25
27
|
dependencies = [
|
26
28
|
"httpx[http2]>=0.25.0",
|
@@ -33,7 +35,7 @@ dependencies = [
|
|
33
35
|
"beautifulsoup4>=4.12.0", # Added for HTML scraping
|
34
36
|
"PyGithub>=1.59.0", # NEW: For GitHub API interaction
|
35
37
|
"openai>=1.3.0",
|
36
|
-
"sigstore>=1.0.0",
|
38
|
+
"sigstore>=1.0.0", # not used yet, may need to remove
|
37
39
|
# "socials", is for regexing
|
38
40
|
# custom domains
|
39
41
|
"python-whois>=0.8.0",
|
@@ -44,6 +46,11 @@ dependencies = [
|
|
44
46
|
# "en_core_web_sm"
|
45
47
|
"rich-argparse",
|
46
48
|
"en-core-web-sm",
|
49
|
+
# "pypi_attestations"
|
50
|
+
"PyYAML>=6.0"
|
51
|
+
]
|
52
|
+
keywords = [
|
53
|
+
"PyPI maintainers", "package owners", "package provenance", "software supply chain", "PEP 541"
|
47
54
|
]
|
48
55
|
|
49
56
|
|
@@ -66,6 +73,7 @@ dev = [
|
|
66
73
|
"mypy; python_version >= '3.8'",
|
67
74
|
"types-toml; python_version >= '3.8'",
|
68
75
|
"types-jsonschema; python_version >= '3.8'",
|
76
|
+
"types-PyYAML",
|
69
77
|
# reports
|
70
78
|
|
71
79
|
# build
|
@@ -150,3 +158,8 @@ entity_resolution_llm = false # As requested, disabled by default
|
|
150
158
|
"llm_ner_claim" = 0.20 # Max weight for an LLM-only claim
|
151
159
|
"conflict" = -0.15
|
152
160
|
|
161
|
+
|
162
|
+
[tool.jiggle_version]
|
163
|
+
scheme = "pep440"
|
164
|
+
default_increment = "patch"
|
165
|
+
ignore = ["test", "sample_projects", "dead_code", ".packages"]
|
@@ -5,15 +5,25 @@ __all__ = [
|
|
5
5
|
"__version__",
|
6
6
|
"__description__",
|
7
7
|
"__readme__",
|
8
|
+
"__license__",
|
8
9
|
"__credits__",
|
9
10
|
"__requires_python__",
|
10
11
|
"__status__",
|
12
|
+
"__keywords__",
|
11
13
|
]
|
12
14
|
|
13
15
|
__title__ = "skip-trace"
|
14
|
-
__version__ = "0.1.0"
|
16
|
+
__version__ = "0.1.1"
|
15
17
|
__description__ = "Ownership Attribution for Python Packages"
|
16
18
|
__readme__ = "README.md"
|
19
|
+
__license__ = "MIT"
|
17
20
|
__credits__ = [{"name": "Matthew Dean Martin", "email": "matthewdeanmartin@gmail.com"}]
|
18
|
-
__requires_python__ = ">=3.
|
19
|
-
__status__ = "
|
21
|
+
__requires_python__ = ">=3.13"
|
22
|
+
__status__ = "3 - Alpha"
|
23
|
+
__keywords__ = [
|
24
|
+
"PyPI maintainers",
|
25
|
+
"package owners",
|
26
|
+
"package provenance",
|
27
|
+
"software supply chain",
|
28
|
+
"PEP 541",
|
29
|
+
]
|
@@ -0,0 +1,189 @@
|
|
1
|
+
# skip_trace/analysis/content_scanner.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import datetime
|
5
|
+
import logging
|
6
|
+
import re
|
7
|
+
from typing import List
|
8
|
+
|
9
|
+
from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
|
10
|
+
from ..utils.validation import is_valid_email
|
11
|
+
from . import ner
|
12
|
+
from .evidence import _parse_contact_string, generate_evidence_id
|
13
|
+
|
14
|
+
# Regex to find copyright notices, capturing the holder.
|
15
|
+
COPYRIGHT_RE = re.compile(
|
16
|
+
r"copyright\s*(?:\(c\))?\s*(?:[0-9,\-\s]+)?\s*([^\n]+)", re.IGNORECASE
|
17
|
+
)
|
18
|
+
|
19
|
+
# Regex to find __author__ assignments
|
20
|
+
AUTHOR_RE = re.compile(r"__author__\s*=\s*['\"]([^'\"]+)['\"]")
|
21
|
+
|
22
|
+
# Regex for finding standalone email addresses - used as a fast pre-filter
|
23
|
+
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b")
|
24
|
+
|
25
|
+
# --- Regex for finding URLs in text content ---
|
26
|
+
URL_RE = re.compile(
|
27
|
+
r"""\b(?:https?://|www\.)[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+(?:[/?#]\S*)?"""
|
28
|
+
)
|
29
|
+
|
30
|
+
# Words that indicate a regex grabbed junk from a license instead of a name.
|
31
|
+
JUNK_WORDS = {
|
32
|
+
"copyright",
|
33
|
+
"holders",
|
34
|
+
"license",
|
35
|
+
"document",
|
36
|
+
"accompanies",
|
37
|
+
"notice",
|
38
|
+
"authors",
|
39
|
+
"identifies",
|
40
|
+
"endorse",
|
41
|
+
"promote",
|
42
|
+
"software",
|
43
|
+
"permission",
|
44
|
+
"conditions",
|
45
|
+
"and",
|
46
|
+
"other",
|
47
|
+
"the",
|
48
|
+
"for",
|
49
|
+
"with",
|
50
|
+
"this",
|
51
|
+
"list",
|
52
|
+
"following",
|
53
|
+
"txt",
|
54
|
+
"damages",
|
55
|
+
"owner",
|
56
|
+
"incidental",
|
57
|
+
"holder",
|
58
|
+
"liability",
|
59
|
+
"MIT",
|
60
|
+
"BSD",
|
61
|
+
}
|
62
|
+
|
63
|
+
logger = logging.getLogger(__name__)
|
64
|
+
|
65
|
+
|
66
|
+
def scan_text(
|
67
|
+
content: str, locator: str, source: EvidenceSource, is_python_file: bool = False
|
68
|
+
) -> List[EvidenceRecord]:
|
69
|
+
"""
|
70
|
+
Scans a string of text content for ownership evidence.
|
71
|
+
|
72
|
+
Args:
|
73
|
+
content: The text content to scan.
|
74
|
+
locator: The path or URL where the content was found.
|
75
|
+
source: The EvidenceSource to assign to new records.
|
76
|
+
is_python_file: Flag to enable Python-specific scans like `__author__`.
|
77
|
+
|
78
|
+
Returns:
|
79
|
+
A list of EvidenceRecord objects found in the text.
|
80
|
+
"""
|
81
|
+
logger.info(f"Scanning {locator}")
|
82
|
+
evidence_list: List[EvidenceRecord] = []
|
83
|
+
now = datetime.datetime.now(datetime.timezone.utc)
|
84
|
+
found_in_scan = set() # Avoid creating duplicate records from the same scan
|
85
|
+
|
86
|
+
# 1. Scan for copyright notices
|
87
|
+
for match in COPYRIGHT_RE.finditer(content):
|
88
|
+
copyright_text = match.group(1).strip().rstrip(",.")
|
89
|
+
entities = ner.extract_entities(copyright_text)
|
90
|
+
if entities:
|
91
|
+
for entity_name, entity_label in entities:
|
92
|
+
if entity_name.lower() not in JUNK_WORDS:
|
93
|
+
key = ("copyright", entity_name)
|
94
|
+
if key in found_in_scan:
|
95
|
+
continue
|
96
|
+
found_in_scan.add(key)
|
97
|
+
value: dict[str, str | None] = {"holder": entity_name}
|
98
|
+
record = EvidenceRecord(
|
99
|
+
id=generate_evidence_id(
|
100
|
+
source,
|
101
|
+
EvidenceKind.COPYRIGHT,
|
102
|
+
locator,
|
103
|
+
str(value),
|
104
|
+
entity_name,
|
105
|
+
),
|
106
|
+
source=source,
|
107
|
+
locator=locator,
|
108
|
+
kind=EvidenceKind.COPYRIGHT,
|
109
|
+
value=value,
|
110
|
+
observed_at=now,
|
111
|
+
confidence=0.40,
|
112
|
+
notes=f"Found copyright holder '{entity_name}' via NER ({entity_label}) in '{locator}'.",
|
113
|
+
)
|
114
|
+
evidence_list.append(record)
|
115
|
+
|
116
|
+
# 2. Scan for __author__ tags in Python files
|
117
|
+
if is_python_file:
|
118
|
+
for match in AUTHOR_RE.finditer(content):
|
119
|
+
author_str = match.group(1).strip()
|
120
|
+
key = ("author", author_str)
|
121
|
+
if key in found_in_scan:
|
122
|
+
continue
|
123
|
+
found_in_scan.add(key)
|
124
|
+
parsed = _parse_contact_string(author_str)
|
125
|
+
if parsed.get("name") or parsed.get("email"):
|
126
|
+
value = {"name": parsed["name"], "email": parsed["email"]}
|
127
|
+
slug = parsed["name"] or parsed["email"] or "unknown"
|
128
|
+
record = EvidenceRecord(
|
129
|
+
id=generate_evidence_id(
|
130
|
+
source, EvidenceKind.AUTHOR_TAG, locator, str(value), slug
|
131
|
+
),
|
132
|
+
source=source,
|
133
|
+
locator=locator,
|
134
|
+
kind=EvidenceKind.AUTHOR_TAG,
|
135
|
+
value=value,
|
136
|
+
observed_at=now,
|
137
|
+
confidence=0.20,
|
138
|
+
notes=f"Found __author__ tag for '{author_str}' in '{locator}'.",
|
139
|
+
)
|
140
|
+
evidence_list.append(record)
|
141
|
+
|
142
|
+
# 3. Scan for any standalone email address
|
143
|
+
for match in EMAIL_RE.finditer(content):
|
144
|
+
if valid_email := is_valid_email(match.group(0)):
|
145
|
+
if ("email", valid_email) in found_in_scan:
|
146
|
+
continue
|
147
|
+
found_in_scan.add(("email", valid_email))
|
148
|
+
value = {"name": None, "email": valid_email}
|
149
|
+
record = EvidenceRecord(
|
150
|
+
id=generate_evidence_id(
|
151
|
+
source, EvidenceKind.CONTACT, locator, str(value), valid_email
|
152
|
+
),
|
153
|
+
source=source,
|
154
|
+
locator=locator,
|
155
|
+
kind=EvidenceKind.CONTACT,
|
156
|
+
value=value,
|
157
|
+
observed_at=now,
|
158
|
+
confidence=0.15,
|
159
|
+
notes=f"Found validated contact email '{valid_email}' in '{locator}'.",
|
160
|
+
)
|
161
|
+
evidence_list.append(record)
|
162
|
+
|
163
|
+
# 4. Scan for any URLs
|
164
|
+
for match in URL_RE.finditer(content):
|
165
|
+
url = match.group(0)
|
166
|
+
if ("url", url) in found_in_scan:
|
167
|
+
continue
|
168
|
+
found_in_scan.add(("url", url))
|
169
|
+
value = {"label": "URL found in content", "url": url}
|
170
|
+
record = EvidenceRecord(
|
171
|
+
id=generate_evidence_id(
|
172
|
+
source,
|
173
|
+
EvidenceKind.PROJECT_URL,
|
174
|
+
locator,
|
175
|
+
str(value),
|
176
|
+
url,
|
177
|
+
hint="content-scan",
|
178
|
+
),
|
179
|
+
source=source,
|
180
|
+
locator=locator,
|
181
|
+
kind=EvidenceKind.PROJECT_URL,
|
182
|
+
value=value,
|
183
|
+
observed_at=now,
|
184
|
+
confidence=0.10,
|
185
|
+
notes=f"Found URL '{url}' in '{locator}'.",
|
186
|
+
)
|
187
|
+
evidence_list.append(record)
|
188
|
+
|
189
|
+
return evidence_list
|
@@ -94,7 +94,7 @@ def _parse_contact_string(contact_str: str) -> Dict[str, Optional[str]]:
|
|
94
94
|
if not contact_str or not contact_str.strip():
|
95
95
|
return {"name": None, "email": None}
|
96
96
|
|
97
|
-
# Pattern for "Name <
|
97
|
+
# Pattern for "Name <user@example.com>"
|
98
98
|
match = re.search(r"(.+)<(.+)>", contact_str)
|
99
99
|
if match:
|
100
100
|
name = match.group(1).strip()
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
4
4
|
import collections
|
5
5
|
import logging
|
6
6
|
from typing import Dict, List, Optional, Tuple
|
7
|
+
from urllib.parse import urlparse
|
7
8
|
|
8
9
|
import tldextract
|
9
10
|
|
@@ -97,6 +98,18 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
|
|
97
98
|
kind = OwnerKind.INDIVIDUAL
|
98
99
|
else:
|
99
100
|
kind = OwnerKind.PROJECT
|
101
|
+
|
102
|
+
# NEW: Handle PyPI Publisher Attestation
|
103
|
+
elif record.kind == EvidenceKind.PYPI_PUBLISHER_ATTESTATION:
|
104
|
+
repo_slug = record.value.get("repository")
|
105
|
+
if repo_slug and "/" in repo_slug:
|
106
|
+
name = repo_slug.split("/")[0] # The user or org
|
107
|
+
kind = OwnerKind.PROJECT
|
108
|
+
|
109
|
+
# --- Handle EMAIL evidence directly ---
|
110
|
+
elif record.kind == EvidenceKind.EMAIL:
|
111
|
+
name = record.value.get("email")
|
112
|
+
kind = OwnerKind.INDIVIDUAL
|
100
113
|
# Handle user profile and company evidence
|
101
114
|
elif record.kind == EvidenceKind.USER_PROFILE:
|
102
115
|
name = record.value.get("user_name")
|
@@ -123,7 +136,7 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
|
|
123
136
|
if not raw_holder:
|
124
137
|
return None, kind
|
125
138
|
|
126
|
-
# ---
|
139
|
+
# --- Sanitize the raw string before accepting it as a name ---
|
127
140
|
# 1. Reject if it's too long to be a name.
|
128
141
|
if len(raw_holder) > 50:
|
129
142
|
return None, kind
|
@@ -137,6 +150,38 @@ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], Owne
|
|
137
150
|
kind = OwnerKind.INDIVIDUAL
|
138
151
|
else:
|
139
152
|
kind = OwnerKind.COMPANY
|
153
|
+
elif record.kind == EvidenceKind.SIGSTORE_SIGNER_IDENTITY:
|
154
|
+
identity = record.value.get("identity", "")
|
155
|
+
if "@" in identity and "." in identity: # Looks like an email
|
156
|
+
name = identity
|
157
|
+
kind = OwnerKind.INDIVIDUAL
|
158
|
+
else:
|
159
|
+
try:
|
160
|
+
# Try to parse a build identity URL
|
161
|
+
parsed = urlparse(identity)
|
162
|
+
if parsed.hostname and "github.com" in parsed.hostname:
|
163
|
+
path_parts = [p for p in parsed.path.split("/") if p]
|
164
|
+
if len(path_parts) >= 1:
|
165
|
+
name = path_parts[0] # The user or org
|
166
|
+
kind = OwnerKind.PROJECT
|
167
|
+
else:
|
168
|
+
name = identity
|
169
|
+
kind = OwnerKind.PROJECT
|
170
|
+
except Exception:
|
171
|
+
name = identity
|
172
|
+
kind = OwnerKind.PROJECT
|
173
|
+
elif record.kind == EvidenceKind.SIGSTORE_BUILD_PROVENANCE:
|
174
|
+
repo_uri = record.value.get("repo_uri", "")
|
175
|
+
try:
|
176
|
+
# Parse git+https://github.com/org/repo.git
|
177
|
+
parsed = urlparse(repo_uri.split("@")[0].replace("git+", ""))
|
178
|
+
if parsed.hostname and "github.com" in parsed.hostname:
|
179
|
+
path_parts = [p for p in parsed.path.split("/") if p]
|
180
|
+
if len(path_parts) >= 1:
|
181
|
+
name = path_parts[0] # The user or org
|
182
|
+
kind = OwnerKind.PROJECT
|
183
|
+
except Exception:
|
184
|
+
name = None
|
140
185
|
|
141
186
|
return name, kind
|
142
187
|
|
@@ -64,7 +64,7 @@ JUNK_WORDS = {
|
|
64
64
|
"BSD",
|
65
65
|
}
|
66
66
|
|
67
|
-
# ---
|
67
|
+
# --- Filename allowlist and more robust binary detection ---
|
68
68
|
|
69
69
|
# A set of common extensionless text files that should never be treated as binary.
|
70
70
|
TEXT_FILENAMES = {
|