skip-trace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skip_trace/__about__.py +19 -0
- skip_trace/__init__.py +6 -0
- skip_trace/__main__.py +9 -0
- skip_trace/analysis/__init__.py +4 -0
- skip_trace/analysis/evidence.py +312 -0
- skip_trace/analysis/ner.py +58 -0
- skip_trace/analysis/scoring.py +282 -0
- skip_trace/analysis/source_scanner.py +411 -0
- skip_trace/cli.py +177 -0
- skip_trace/collectors/__init__.py +4 -0
- skip_trace/collectors/github.py +241 -0
- skip_trace/collectors/package_files.py +150 -0
- skip_trace/collectors/pypi.py +158 -0
- skip_trace/collectors/whois.py +202 -0
- skip_trace/config.py +165 -0
- skip_trace/exceptions.py +22 -0
- skip_trace/main.py +269 -0
- skip_trace/py.typed.py +0 -0
- skip_trace/reporting/__init__.py +0 -0
- skip_trace/reporting/json_reporter.py +22 -0
- skip_trace/reporting/md_reporter.py +115 -0
- skip_trace/schemas.py +131 -0
- skip_trace/utils/__init__.py +4 -0
- skip_trace/utils/cache.py +77 -0
- skip_trace/utils/cli_suggestions.py +91 -0
- skip_trace/utils/http_client.py +45 -0
- skip_trace/utils/safe_targz.py +161 -0
- skip_trace/utils/validation.py +52 -0
- skip_trace-0.1.0.dist-info/METADATA +125 -0
- skip_trace-0.1.0.dist-info/RECORD +33 -0
- skip_trace-0.1.0.dist-info/WHEEL +4 -0
- skip_trace-0.1.0.dist-info/entry_points.txt +2 -0
- skip_trace-0.1.0.dist-info/licenses/LICENSE +21 -0
skip_trace/__about__.py
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
"""Metadata for skip_trace."""
|
2
|
+
|
3
|
+
# Names exported by a star-import of this module; one entry per metadata
# constant defined below.
__all__ = [
    "__title__",
    "__version__",
    "__description__",
    "__readme__",
    "__credits__",
    "__requires_python__",
    "__status__",
]

# Distribution name (hyphenated form).
__title__ = "skip-trace"
# Version string of this release.
__version__ = "0.1.0"
# One-line summary of what the package does.
__description__ = "Ownership Attribution for Python Packages"
# File name of the long-description document.
__readme__ = "README.md"
# Author records; each entry is a dict with "name" and "email" keys.
__credits__ = [{"name": "Matthew Dean Martin", "email": "matthewdeanmartin@gmail.com"}]
# Supported interpreter range, written as a version specifier.
__requires_python__ = ">=3.8"
# Development status label.
__status__ = "1 - Planning"
|
skip_trace/__init__.py
ADDED
skip_trace/__main__.py
ADDED
@@ -0,0 +1,312 @@
|
|
1
|
+
# skip_trace/analysis/evidence.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import datetime
|
5
|
+
import hashlib
|
6
|
+
import logging
|
7
|
+
import re
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple
|
9
|
+
|
10
|
+
import tldextract
|
11
|
+
|
12
|
+
from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource, Maintainer
|
13
|
+
from ..utils.validation import is_valid_email
|
14
|
+
from . import ner # Import the NER module
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
def _slugify(text: str) -> str:
|
20
|
+
"""
|
21
|
+
Creates a URL-friendly slug from a string.
|
22
|
+
|
23
|
+
Converts to lowercase, folds to ASCII, replaces non-alphanumeric
|
24
|
+
characters with hyphens, and removes duplicate hyphens.
|
25
|
+
|
26
|
+
Args:
|
27
|
+
text: The string to slugify.
|
28
|
+
|
29
|
+
Returns:
|
30
|
+
A slugified string.
|
31
|
+
"""
|
32
|
+
if not text:
|
33
|
+
return ""
|
34
|
+
# Simple ASCII folding
|
35
|
+
text = text.encode("ascii", "ignore").decode("ascii")
|
36
|
+
text = text.lower()
|
37
|
+
# Replace non-alphanumeric with hyphen
|
38
|
+
text = re.sub(r"[^a-z0-9]+", "-", text).strip("-")
|
39
|
+
return text
|
40
|
+
|
41
|
+
|
42
|
+
def generate_evidence_id(
    source: EvidenceSource,
    kind: EvidenceKind,
    locator: str,
    value: Any,  # Any, so stringified dataclasses/dicts can be hashed too
    slug_subject: str,
    hint: Optional[str] = None,
) -> str:
    """
    Builds a readable, deterministic identifier for a piece of evidence.

    Format: e-<source>-<kind>-<slug>[--<hint>]~<hash8>

    Args:
        source: Where the evidence came from; its ``.value`` is embedded.
        kind: What kind of evidence this is; its ``.value`` is embedded.
        locator: The URL or path where the evidence was found.
        value: The evidence payload; its string form feeds the hash.
        slug_subject: The primary entity (e.g., person, org) used for the slug.
        hint: Optional extra text appended to the slug for disambiguation.

    Returns:
        The formatted evidence ID string.
    """
    # Cap on the readable part; intended to keep the whole ID under 96 chars.
    max_slug_len = 60

    slug_parts = [_slugify(slug_subject)]
    if hint:
        slug_parts.append(_slugify(hint))
    readable = "--".join(slug_parts)[:max_slug_len]

    # The first 8 hex digits of a SHA-256 over source, kind, locator and value
    # make the ID deterministic for identical inputs.
    fingerprint = f"{source.value}|{kind.value}|{locator}|{value}"
    hash8 = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()[:8]

    return f"e-{source.value}-{kind.value}-{readable}~{hash8}"
|
80
|
+
|
81
|
+
|
82
|
+
def _parse_contact_string(contact_str: str) -> Dict[str, Optional[str]]:
|
83
|
+
"""
|
84
|
+
Parses a contact string into its name and email components.
|
85
|
+
|
86
|
+
Handles formats like "Name <email>", "email", and "Name".
|
87
|
+
|
88
|
+
Args:
|
89
|
+
contact_str: The string to parse.
|
90
|
+
|
91
|
+
Returns:
|
92
|
+
A dictionary with "name" and "email" keys.
|
93
|
+
"""
|
94
|
+
if not contact_str or not contact_str.strip():
|
95
|
+
return {"name": None, "email": None}
|
96
|
+
|
97
|
+
# Pattern for "Name <email@domain.com>"
|
98
|
+
match = re.search(r"(.+)<(.+)>", contact_str)
|
99
|
+
if match:
|
100
|
+
name = match.group(1).strip()
|
101
|
+
# Validate the email part using the robust validator
|
102
|
+
email = is_valid_email(match.group(2).strip())
|
103
|
+
return {"name": name, "email": email}
|
104
|
+
|
105
|
+
# If the whole string is a valid email, use it.
|
106
|
+
if email := is_valid_email(contact_str):
|
107
|
+
return {"name": None, "email": email}
|
108
|
+
|
109
|
+
# Fallback to NER if it's not a clear email format
|
110
|
+
list_of_entities = ner.extract_entities(contact_str.strip())
|
111
|
+
for entity, kind in list_of_entities:
|
112
|
+
if kind == "PERSON":
|
113
|
+
return {"name": entity, "email": None}
|
114
|
+
|
115
|
+
# If no email and no person from NER, we got nothing
|
116
|
+
return {"name": None, "email": None}
|
117
|
+
|
118
|
+
|
119
|
+
# Helper to sanitize fields that might contain the literal string "None"
|
120
|
+
def _clean_pypi_field(field_value: Any) -> str:
|
121
|
+
"""Returns an empty string if the field is None or the literal string 'None'."""
|
122
|
+
if field_value is None or str(field_value).strip().lower() == "none":
|
123
|
+
return ""
|
124
|
+
return str(field_value).strip()
|
125
|
+
|
126
|
+
|
127
|
+
def extract_from_pypi(
    metadata: Dict[str, Any],
) -> Tuple[List[EvidenceRecord], List[Maintainer]]:
    """
    Extracts evidence from raw PyPI package metadata.

    Two kinds of evidence are produced:

    * PERSON / EMAIL records from the author and maintainer fields. Entities
      are found with NER first; when NER yields nothing (model unavailable, or
      the field is e.g. a bare email address) the field is parsed with the
      plain "Name <email>" parser at a lower confidence instead of being
      dropped.
    * ORGANIZATION records from repository links in ``project_urls`` that
      point at the main site of a known forge (github, gitlab, codeberg).

    Args:
        metadata: The dictionary of package metadata from the PyPI JSON API.

    Returns:
        A tuple containing:
        - A list of EvidenceRecord objects.
        - A list of Maintainer objects from direct PyPI fields.
        Both lists are empty when ``metadata`` has no "info" dictionary.
    """
    evidence_list: List[EvidenceRecord] = []
    maintainer_list: List[Maintainer] = []
    seen_maintainers = set()

    info = metadata.get("info", {})
    if not info:
        logger.warning("PyPI metadata is missing the 'info' dictionary.")
        return [], []

    package_name = info.get("name", "unknown")
    package_version = info.get("version", "latest")
    locator = f"https://pypi.org/pypi/{package_name}/{package_version}/json"
    now = datetime.datetime.now(datetime.timezone.utc)

    # --- Create separate evidence for names and emails ---
    def process_contact_string(
        raw_string: str, role_kind: EvidenceKind, field_name: str
    ) -> None:
        """
        Parses a string for contacts and creates separate evidence records
        for each piece of information (name, email).
        """
        if not raw_string:
            return

        def add_separate_evidence(
            parsed_contact: Dict[str, Optional[str]],
            source_note: str,
            confidence: float,
        ) -> None:
            """Creates and appends separate evidence for name and email."""
            name = parsed_contact.get("name")
            email = parsed_contact.get("email")

            # Also create a simple Maintainer object for direct reporting,
            # once per distinct (name, email) pair.
            if name or email:
                key = (name, email)
                if key not in seen_maintainers:
                    maintainer_list.append(
                        Maintainer(
                            name=name or "Unknown", email=email, confidence=confidence
                        )
                    )
                    seen_maintainers.add(key)

            # Create evidence for the name, if it exists
            if name:
                kind = EvidenceKind.PERSON
                value = {"name": name}
                record = EvidenceRecord(
                    id=generate_evidence_id(
                        EvidenceSource.PYPI, kind, locator, str(value), name
                    ),
                    source=EvidenceSource.PYPI,
                    locator=locator,
                    kind=kind,
                    value=value,
                    observed_at=now,
                    confidence=confidence,
                    notes=f"Found person '{name}' from PyPI '{field_name}' field ({source_note}). Designated as {role_kind.value}.",
                )
                evidence_list.append(record)
                logger.debug(
                    f"Created {kind.value} evidence for '{name}' from '{field_name}'."
                )

            # Create evidence for the email, if it exists
            if email:
                kind = EvidenceKind.EMAIL
                value = {"email": email}
                # Use the name for the slug if available, otherwise email's local part
                slug_subject = name or email.split("@")[0]
                record = EvidenceRecord(
                    id=generate_evidence_id(
                        EvidenceSource.PYPI, kind, locator, str(value), slug_subject
                    ),
                    source=EvidenceSource.PYPI,
                    locator=locator,
                    kind=kind,
                    value=value,
                    observed_at=now,
                    confidence=confidence + 0.1,  # Emails are slightly more reliable
                    notes=f"Found email for '{slug_subject}' from PyPI '{field_name}' field ({source_note}). Designated as {role_kind.value}.",
                )
                evidence_list.append(record)
                logger.debug(
                    f"Created {kind.value} evidence for '{email}' from '{field_name}'."
                )

        # Attempt to use NER to find multiple entities
        entities = ner.extract_entities(raw_string)
        if entities:
            logger.debug(
                f"NER found {len(entities)} entities in PyPI field '{field_name}': {entities}"
            )
            for entity_name, _entity_label in entities:
                parsed = _parse_contact_string(entity_name)
                add_separate_evidence(parsed, "NER", confidence=0.45)
        else:
            # NER returns nothing when no model is loaded or when the field is
            # just an email address; parse the raw string directly so the
            # contact is not silently lost. Lower confidence than NER hits.
            parsed = _parse_contact_string(raw_string)
            add_separate_evidence(parsed, "regex fallback", confidence=0.30)

    # Process author and maintainer fields
    author_name = _clean_pypi_field(info.get("author"))
    author_email = _clean_pypi_field(info.get("author_email"))
    # Prefer email string as it's more likely to contain both name and email
    author_string = author_email or author_name
    if author_string:
        process_contact_string(
            author_string, EvidenceKind.AUTHOR_TAG, "author/author_email"
        )

    maintainer_name = _clean_pypi_field(info.get("maintainer"))
    maintainer_email = _clean_pypi_field(info.get("maintainer_email"))
    maintainer_string = maintainer_email or maintainer_name

    # Only process maintainer if it's different from the author string
    if maintainer_string and maintainer_string != author_string:
        process_contact_string(
            maintainer_string, EvidenceKind.MAINTAINER, "maintainer/maintainer_email"
        )

    # --- Project URL parsing ---
    project_urls = info.get("project_urls")
    if isinstance(project_urls, dict):
        logger.debug(f"Found {len(project_urls)} project URLs to analyze.")
        for label, url in project_urls.items():
            if not url or not isinstance(url, str):
                continue

            domain_info = tldextract.extract(url)
            repo_host = domain_info.domain
            logger.debug(f"Parsing project URL ({label}): {url}")

            if repo_host not in ("github", "gitlab", "codeberg"):
                continue
            # Only the forge's main site uses /<owner>/<repo> paths. On other
            # subdomains (e.g. <user>.github.io, docs.github.com) the first
            # path segment is not an account name, so skip those links.
            if domain_info.subdomain not in ("", "www"):
                continue

            # "scheme:", "", host, owner, ... -> the owner is the 4th element.
            path_parts = url.strip("/").split("/")
            if len(path_parts) < 4:
                continue
            # Drop any query string or fragment glued to the owner segment.
            org_or_user = path_parts[3].split("?", 1)[0].split("#", 1)[0]
            if not org_or_user:
                continue

            logger.debug(f"Extracted user/org '{org_or_user}' from URL.")
            notes = (
                f"Found user/org '{org_or_user}' from repository URL in project_urls."
            )
            # Several project URLs usually point into the same repository;
            # record each owner only once (the notes text identifies it).
            if any(existing.notes == notes for existing in evidence_list):
                continue

            value = {"name": org_or_user, "url": url}
            evidence_list.append(
                EvidenceRecord(
                    id=generate_evidence_id(
                        EvidenceSource.PYPI,
                        EvidenceKind.ORGANIZATION,
                        locator,
                        str(value),
                        org_or_user,
                        hint=f"{repo_host}-user",
                    ),
                    source=EvidenceSource.PYPI,
                    locator=locator,
                    kind=EvidenceKind.ORGANIZATION,
                    value=value,
                    observed_at=now,
                    confidence=0.35,
                    notes=notes,
                )
            )

    logger.info(
        f"Extracted {len(evidence_list)} evidence records and {len(maintainer_list)} maintainers from PyPI."
    )
    return evidence_list, maintainer_list
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# skip_trace/analysis/ner.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import logging
|
5
|
+
from typing import List, Optional, Tuple
|
6
|
+
|
7
|
+
import spacy
|
8
|
+
from spacy.language import Language
|
9
|
+
|
10
|
+
SPACY_AVAILABLE = True
|
11
|
+
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
_nlp: Optional[Language] = None
|
16
|
+
|
17
|
+
|
18
|
+
def _get_nlp_model() -> Optional[Language]:
    """Loads and caches the spaCy model. Returns None if unavailable.

    The model is loaded on first use and kept in the module-level ``_nlp``.
    A failed load is remembered, so later calls return None immediately
    instead of retrying the load and repeating the warning on every call.

    Returns:
        The loaded spaCy pipeline, or None when spaCy is flagged unavailable
        or the 'en_core_web_sm' model could not be loaded.
    """
    global _nlp
    if not SPACY_AVAILABLE:
        return None
    # The failure marker lives on the function object so no extra module
    # global is needed and SPACY_AVAILABLE is left untouched.
    if getattr(_get_nlp_model, "_load_failed", False):
        return None
    if _nlp is None:
        try:
            logger.debug("Loading spaCy model 'en_core_web_sm'...")
            _nlp = spacy.load("en_core_web_sm")
            logger.info("Successfully loaded spaCy NER model.")
        except IOError:
            logger.warning(
                "spaCy is installed, but model 'en_core_web_sm' not found. "
                "Run 'python -m spacy download en_core_web_sm' to install it."
            )
            _get_nlp_model._load_failed = True
            return None
    return _nlp
|
35
|
+
|
36
|
+
|
37
|
+
def extract_entities(text: str) -> List[Tuple[str, str]]:
    """
    Runs spaCy NER over a string and keeps the PERSON and ORG entities.

    Args:
        text: The text to process.

    Returns:
        A list of (entity_text, entity_label) tuples in document order, with
        the entity text whitespace-stripped. Empty when no model could be
        loaded.
    """
    nlp = _get_nlp_model()
    if not nlp:
        return []

    wanted_labels = ["PERSON", "ORG"]
    found = []
    for span in nlp(text).ents:
        if span.label_ not in wanted_labels:
            continue
        found.append((span.text.strip(), span.label_))
        logger.debug(f"NER found entity: '{span.text}' (Label: {span.label_})")
    return found
|
@@ -0,0 +1,282 @@
|
|
1
|
+
# skip_trace/analysis/scoring.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import collections
|
5
|
+
import logging
|
6
|
+
from typing import Dict, List, Optional, Tuple
|
7
|
+
|
8
|
+
import tldextract
|
9
|
+
|
10
|
+
from ..analysis.evidence import _parse_contact_string # Import the parser for reuse
|
11
|
+
from ..config import CONFIG
|
12
|
+
from ..schemas import (
|
13
|
+
Contact,
|
14
|
+
ContactType,
|
15
|
+
EvidenceKind,
|
16
|
+
EvidenceRecord,
|
17
|
+
OwnerCandidate,
|
18
|
+
OwnerKind,
|
19
|
+
)
|
20
|
+
|
21
|
+
# Words that indicate a regex grabbed junk from a license instead of a name.
# A copyright holder string containing any of these is treated as license
# boilerplate rather than an owner name.
# NOTE(review): the consumer in this module lower-cases the text it tests, so
# the upper-case entries ("MIT", "BSD") can only match if the consumer also
# lower-cases these entries -- confirm the intended behavior. Several short
# entries ("and", "the", "for", ...) also occur inside ordinary names, so the
# matching strategy (substring vs. whole word) matters.
JUNK_WORDS = {
    "copyright",
    "holders",
    "license",
    "document",
    "accompanies",
    "identifies",
    "endorse",
    "promote",
    "software",
    "permission",
    "danger",
    "warranty",
    "bsd",
    "liability",
    # duped
    "notice",
    "authors",
    "conditions",
    # stop words
    "and",
    "other",
    "the",
    "for",
    "with",
    "this",
    "list",
    "following",
    "txt",
    "damages",
    "owner",
    # legalese
    "incidental",
    "holder",
    # license names
    "MIT",
    "BSD",
}
|
60
|
+
|
61
|
+
logger = logging.getLogger(__name__)
|
62
|
+
|
63
|
+
|
64
|
+
def _normalize_name(name: str) -> str:
    """Returns the grouping key for a name.

    The input is run through the contact parser so an embedded email does not
    end up in the key: the parsed name is preferred, then the parsed email,
    then the raw input. The result is whitespace-stripped, has trailing
    commas, periods and apostrophes removed, and is lower-cased.
    """
    contact = _parse_contact_string(name)
    candidate = contact.get("name")
    if not candidate:
        candidate = contact.get("email")
    if not candidate:
        candidate = name
    # Strip common trailing punctuation for better grouping
    trimmed = candidate.strip().rstrip(",.'")
    return trimmed.lower()
|
71
|
+
|
72
|
+
|
73
|
+
def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], OwnerKind]:
    """Extracts a primary entity name and kind from an evidence record.

    Args:
        record: The evidence record to inspect. ``record.value`` is read as a
            dict whose relevant keys depend on ``record.kind``.

    Returns:
        A ``(name, kind)`` tuple. ``name`` is None when the record does not
        identify an entity (unhandled kind, missing field, or a copyright
        holder that looks like license boilerplate).
    """
    kind = OwnerKind.INDIVIDUAL  # Default
    name = None

    if record.kind in (
        EvidenceKind.MAINTAINER,
        EvidenceKind.AUTHOR_TAG,
        EvidenceKind.COMMIT_AUTHOR,
        EvidenceKind.PYPI_USER,
        EvidenceKind.USER_PROFILE,
        EvidenceKind.CONTACT,  # Handle generic contacts
        # PERSON / EMAIL records carry {"name": ...} / {"email": ...}; without
        # these entries such evidence would resolve to no entity at all.
        EvidenceKind.PERSON,
        EvidenceKind.EMAIL,
    ):
        # "user_name" is consulted last so profile-style records that carry
        # only that key still resolve (a separate USER_PROFILE branch further
        # down could never run because this branch matches first).
        raw_name = (
            record.value.get("name")
            or record.value.get("email")
            or record.value.get("user_name")
        )
        if raw_name:
            # The name might be "Name <email>", so parse it
            parsed = _parse_contact_string(raw_name)
            name = parsed.get("name") or parsed.get("email")
        kind = OwnerKind.INDIVIDUAL
    elif record.kind in (EvidenceKind.ORGANIZATION, EvidenceKind.REPO_OWNER):
        name = record.value.get("name")
        # A simple heuristic: if it contains spaces, it's likely a person's name
        if name and " " in name:
            kind = OwnerKind.INDIVIDUAL
        else:
            kind = OwnerKind.PROJECT
    elif record.kind == EvidenceKind.USER_COMPANY:
        # The primary entity is the user, but this record also implies a company
        name = record.value.get("user_name")
        kind = OwnerKind.INDIVIDUAL
    elif record.kind == EvidenceKind.PROJECT_URL:
        url = record.value.get("url", "")
        domain_info = tldextract.extract(url)
        # Only a URL with both a registrable domain and a suffix names an entity.
        if domain_info.domain and domain_info.suffix:
            name = domain_info.domain.capitalize()
            kind = OwnerKind.COMPANY
    # Handle WHOIS domain evidence
    elif record.kind == EvidenceKind.DOMAIN:
        name = record.value.get("name")
        kind = OwnerKind.COMPANY
    # Handle COPYRIGHT evidence from file scans
    elif record.kind == EvidenceKind.COPYRIGHT:
        raw_holder = record.value.get("holder")
        if not raw_holder:
            return None, kind

        # --- Sanitize the raw string before accepting it as a name ---
        # 1. Reject if it's too long to be a name.
        if len(raw_holder) > 50:
            return None, kind
        # 2. Reject if it contains common license garbage words. Compare whole
        #    words, case-insensitively: a substring test would reject real
        #    names ("the" in "Matthew", "and" in "Anderson") and the upper-case
        #    entries would never match the lower-cased text.
        import re  # Local import: this module does not import re at the top.

        holder_words = set(re.findall(r"[a-z0-9]+", raw_holder.lower()))
        junk_words = {word.lower() for word in JUNK_WORDS}
        if holder_words & junk_words:
            return None, kind

        parsed = _parse_contact_string(raw_holder)
        name = parsed.get("name") or parsed.get("email") or raw_holder
        # An email, a space or a comma suggests a person rather than a company.
        if parsed.get("email") or " " in name or "," in name:
            kind = OwnerKind.INDIVIDUAL
        else:
            kind = OwnerKind.COMPANY

    return name, kind
|
142
|
+
|
143
|
+
|
144
|
+
def score_owners(evidence_records: List[EvidenceRecord]) -> List[OwnerCandidate]:
    """
    Scores and ranks potential owners from a list of evidence.

    Evidence is grouped per entity using a normalized name as the key. Each
    entity's score is the sum of the confidences of its evidence, where the
    first record of every source/kind combination counts in full and further
    records of the same combination count at 10%. The score is rounded to two
    decimals and capped at 1.0. Contacts (emails, repository and profile
    URLs) found in the evidence are attached to the entity.

    Args:
        evidence_records: A list of EvidenceRecord objects to analyze.

    Returns:
        A list of OwnerCandidate objects, sorted by score in descending order.
    """
    # Get suppression settings from config. Names are compared lower-cased, so
    # lower-case the configured entries as well.
    suppressed_orgs = {
        str(org).lower() for org in (CONFIG.get("suppressed_tool_orgs", []) or [])
    }
    lenient_mode = CONFIG.get("lenient_mode_enabled", False)

    # --- 1. Initial Entity Extraction & Alias Mapping ---
    entities: Dict[str, OwnerCandidate] = {}
    evidence_by_entity: Dict[str, List[EvidenceRecord]] = collections.defaultdict(list)

    for record in evidence_records:
        name, kind = _get_entity_from_record(record)

        # Records with no resolvable entity are not dropped; they are grouped
        # under the empty name.
        if not name:
            name = ""

        # Suppress tool orgs like 'github' unless in lenient mode
        if name and (name.lower() in suppressed_orgs) and not lenient_mode:
            continue

        norm_name = _normalize_name(name)
        evidence_by_entity[norm_name].append(record)
        if norm_name not in entities:
            # Use the raw name for display, but the normalized name for grouping
            entities[norm_name] = OwnerCandidate(
                name=name.strip().rstrip(",.'"), kind=kind
            )

        # Also create entities for companies mentioned in user profiles
        if record.kind == EvidenceKind.USER_COMPANY:
            company_name = record.value.get("company_name")
            if company_name:
                norm_co_name = _normalize_name(company_name)
                if norm_co_name not in entities:
                    entities[norm_co_name] = OwnerCandidate(
                        name=company_name, kind=OwnerKind.COMPANY
                    )
                # Associate this evidence with the company too
                evidence_by_entity[norm_co_name].append(record)

    # --- 2. Score each candidate and collect contacts ---
    contact_map = {
        "email": ContactType.EMAIL,
        "twitter": ContactType.TWITTER,
        "linkedin": ContactType.LINKEDIN,
        "mastodon": ContactType.MASTODON,
        "facebook": ContactType.FACEBOOK,
        "instagram": ContactType.INSTAGRAM,
        "youtube": ContactType.YOUTUBE,
        "tiktok": ContactType.TIKTOK,
    }
    for norm_name, owner in entities.items():
        score = 0.0
        seen_rationale_keys = set()
        # Keyed by (type, value) so the same contact is stored only once.
        contacts: Dict[Tuple[ContactType, str], Contact] = {}

        for record in evidence_by_entity[norm_name]:
            owner.evidence.append(record.id)
            rationale_key = f"{record.source.value}-{record.kind.value}"
            weight = record.confidence  # Use confidence from collector

            if rationale_key not in seen_rationale_keys:
                score += weight
                seen_rationale_keys.add(rationale_key)
            else:
                score += weight * 0.1  # Diminishing return

            # --- Collect contact info from all relevant evidence kinds ---
            contact_source_string = None
            if record.kind in (
                EvidenceKind.MAINTAINER,
                EvidenceKind.AUTHOR_TAG,
                EvidenceKind.COMMIT_AUTHOR,
                EvidenceKind.CONTACT,
                # EMAIL evidence stores {"email": ...}; include it so the
                # address is surfaced as a contact.
                EvidenceKind.EMAIL,
            ):
                contact_source_string = record.value.get("email") or record.value.get(
                    "name"
                )
            elif record.kind in (EvidenceKind.ORGANIZATION, EvidenceKind.REPO_OWNER):
                if url := record.value.get("url", record.locator):
                    contacts[(ContactType.REPO, url)] = Contact(
                        type=ContactType.REPO, value=url
                    )
            elif record.kind == EvidenceKind.PYPI_USER:
                if url := record.value.get("url"):
                    contacts[(ContactType.URL, url)] = Contact(
                        type=ContactType.URL, value=url
                    )
            # Collect contacts from USER_PROFILE evidence
            elif record.kind == EvidenceKind.USER_PROFILE:
                # "or {}" tolerates a record whose "contacts" value is None.
                profile_contacts = record.value.get("contacts", {}) or {}
                for key, value in profile_contacts.items():
                    # Unknown contact keys default to a generic URL.
                    contact_type = contact_map.get(key, ContactType.URL)
                    contacts[(contact_type, value)] = Contact(
                        type=contact_type, value=value
                    )
            elif record.kind == EvidenceKind.COPYRIGHT:
                contact_source_string = record.value.get("holder")

            # Parse any found string for an email
            if contact_source_string:
                parsed_contact = _parse_contact_string(contact_source_string)
                if email := parsed_contact.get("email"):
                    contacts[(ContactType.EMAIL, email)] = Contact(
                        type=ContactType.EMAIL, value=email
                    )

        owner.score = min(round(score, 2), 1.0)
        owner.evidence = sorted(set(owner.evidence))
        owner.rationale = " + ".join(sorted(seen_rationale_keys))
        owner.contacts = sorted(
            contacts.values(), key=lambda c: (c.type.value, c.value)
        )

    # --- 3. Sort: highest score first ---
    return sorted(entities.values(), key=lambda c: c.score, reverse=True)
|