skip_trace-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
+ """Metadata for skip_trace."""
+
+ __all__ = [
+     "__title__",
+     "__version__",
+     "__description__",
+     "__readme__",
+     "__credits__",
+     "__requires_python__",
+     "__status__",
+ ]
+
+ __title__ = "skip-trace"
+ __version__ = "0.1.0"
+ __description__ = "Ownership Attribution for Python Packages"
+ __readme__ = "README.md"
+ __credits__ = [{"name": "Matthew Dean Martin", "email": "matthewdeanmartin@gmail.com"}]
+ __requires_python__ = ">=3.8"
+ __status__ = "1 - Planning"
skip_trace/__init__.py ADDED
@@ -0,0 +1,6 @@
+ # skip_trace/__init__.py
+
+ __version__ = "0.1.0"
+
+ # __all__ will be populated as public functions/classes are added.
+ __all__ = []
skip_trace/__main__.py ADDED
@@ -0,0 +1,9 @@
+ # skip_trace/__main__.py
+ from __future__ import annotations
+
+ import sys
+
+ from .cli import main
+
+ if __name__ == "__main__":
+     sys.exit(main())
@@ -0,0 +1,4 @@
+ # skip_trace/analysis/__init__.py
+ from . import evidence, scoring, source_scanner
+
+ __all__ = ["evidence", "scoring", "source_scanner"]
@@ -0,0 +1,312 @@
+ # skip_trace/analysis/evidence.py
+ from __future__ import annotations
+
+ import datetime
+ import hashlib
+ import logging
+ import re
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import tldextract
+
+ from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource, Maintainer
+ from ..utils.validation import is_valid_email
+ from . import ner  # Import the NER module
+
+ logger = logging.getLogger(__name__)
+
+
+ def _slugify(text: str) -> str:
+     """
+     Creates a URL-friendly slug from a string.
+
+     Converts to lowercase, folds to ASCII, replaces non-alphanumeric
+     characters with hyphens, and removes duplicate hyphens.
+
+     Args:
+         text: The string to slugify.
+
+     Returns:
+         A slugified string.
+     """
+     if not text:
+         return ""
+     # Simple ASCII folding
+     text = text.encode("ascii", "ignore").decode("ascii")
+     text = text.lower()
+     # Replace non-alphanumeric with hyphen
+     text = re.sub(r"[^a-z0-9]+", "-", text).strip("-")
+     return text
+
+
+ def generate_evidence_id(
+     source: EvidenceSource,
+     kind: EvidenceKind,
+     locator: str,
+     value: Any,  # Changed to Any for dataclasses
+     slug_subject: str,
+     hint: Optional[str] = None,
+ ) -> str:
+     """
+     Generates a human-readable and deterministic Evidence ID.
+
+     Format: e-<source>-<kind>-<slug>[--<hint>]~<hash8>
+
+     Args:
+         source: The source of the evidence.
+         kind: The kind of evidence.
+         locator: The URL or path where the evidence was found.
+         value: The value of the evidence itself.
+         slug_subject: The primary entity to use for the slug (e.g., person, org).
+         hint: An optional hint to add to the slug for disambiguation.
+
+     Returns:
+         A formatted, unique evidence ID string.
+     """
+     slug = _slugify(slug_subject)
+     if hint:
+         slug = f"{slug}--{_slugify(hint)}"
+
+     # Truncate slug to a reasonable length to keep ID under 96 chars
+     max_slug_len = 60
+     slug = slug[:max_slug_len]
+
+     # Create the hash
+     hasher = hashlib.sha256()
+     hasher.update(f"{source.value}|{kind.value}|{locator}|{value}".encode("utf-8"))
+     hash8 = hasher.hexdigest()[:8]
+
+     return f"e-{source.value}-{kind.value}-{slug}~{hash8}"
+
+
+ def _parse_contact_string(contact_str: str) -> Dict[str, Optional[str]]:
+     """
+     Parses a contact string into its name and email components.
+
+     Handles formats like "Name <email>", "email", and "Name".
+
+     Args:
+         contact_str: The string to parse.
+
+     Returns:
+         A dictionary with "name" and "email" keys.
+     """
+     if not contact_str or not contact_str.strip():
+         return {"name": None, "email": None}
+
+     # Pattern for "Name <email@domain.com>"
+     match = re.search(r"(.+)<(.+)>", contact_str)
+     if match:
+         name = match.group(1).strip()
+         # Validate the email part using the robust validator
+         email = is_valid_email(match.group(2).strip())
+         return {"name": name, "email": email}
+
+     # If the whole string is a valid email, use it.
+     if email := is_valid_email(contact_str):
+         return {"name": None, "email": email}
+
+     # Fallback to NER if it's not a clear email format
+     list_of_entities = ner.extract_entities(contact_str.strip())
+     for entity, kind in list_of_entities:
+         if kind == "PERSON":
+             return {"name": entity, "email": None}
+
+     # If no email and no person from NER, we got nothing
+     return {"name": None, "email": None}
+
+
+ # Helper to sanitize fields that might contain the literal string "None"
+ def _clean_pypi_field(field_value: Any) -> str:
+     """Returns an empty string if the field is None or the literal string 'None'."""
+     if field_value is None or str(field_value).strip().lower() == "none":
+         return ""
+     return str(field_value).strip()
+
+
+ def extract_from_pypi(
+     metadata: Dict[str, Any],
+ ) -> Tuple[List[EvidenceRecord], List[Maintainer]]:
+     """
+     Extracts evidence from raw PyPI package metadata.
+
+     Args:
+         metadata: The dictionary of package metadata from the PyPI JSON API.
+
+     Returns:
+         A tuple containing:
+         - A list of EvidenceRecord objects.
+         - A list of Maintainer objects from direct PyPI fields.
+     """
+     evidence_list: List[EvidenceRecord] = []
+     maintainer_list: List[Maintainer] = []
+     seen_maintainers = set()
+
+     info = metadata.get("info", {})
+     if not info:
+         logger.warning("PyPI metadata is missing the 'info' dictionary.")
+         return [], []
+
+     package_name = info.get("name", "unknown")
+     package_version = info.get("version", "latest")
+     locator = f"https://pypi.org/pypi/{package_name}/{package_version}/json"
+     now = datetime.datetime.now(datetime.timezone.utc)
+
+     # --- Create separate evidence for names and emails ---
+     def process_contact_string(
+         raw_string: str, role_kind: EvidenceKind, field_name: str
+     ) -> None:
+         """
+         Parses a string for contacts and creates separate evidence records
+         for each piece of information (name, email).
+         """
+         if not raw_string:
+             return
+
+         def add_separate_evidence(
+             parsed_contact: Dict[str, Optional[str]],
+             source_note: str,
+             confidence: float,
+         ) -> None:
+             """Creates and appends separate evidence for name and email."""
+             name = parsed_contact.get("name")
+             email = parsed_contact.get("email")
+
+             # Also create a simple Maintainer object for direct reporting
+             if name or email:
+                 key = (name, email)
+                 if key not in seen_maintainers:
+                     maintainer_list.append(
+                         Maintainer(
+                             name=name or "Unknown", email=email, confidence=confidence
+                         )
+                     )
+                     seen_maintainers.add(key)
+
+             # Create evidence for the name, if it exists
+             if name:
+                 # NOTE: Assumes EvidenceKind.PERSON exists in your schema
+                 kind = EvidenceKind.PERSON
+                 value = {"name": name}
+                 record = EvidenceRecord(
+                     id=generate_evidence_id(
+                         EvidenceSource.PYPI, kind, locator, str(value), name
+                     ),
+                     source=EvidenceSource.PYPI,
+                     locator=locator,
+                     kind=kind,
+                     value=value,
+                     observed_at=now,
+                     confidence=confidence,
+                     notes=f"Found person '{name}' from PyPI '{field_name}' field ({source_note}). Designated as {role_kind.value}.",
+                 )
+                 evidence_list.append(record)
+                 logger.debug(
+                     f"Created {kind.value} evidence for '{name}' from '{field_name}'."
+                 )
+
+             # Create evidence for the email, if it exists
+             if email:
+                 # NOTE: Assumes EvidenceKind.EMAIL exists in your schema
+                 kind = EvidenceKind.EMAIL
+                 value = {"email": email}
+                 # Use the name for the slug if available, otherwise email's local part
+                 slug_subject = name or email.split("@")[0]
+                 record = EvidenceRecord(
+                     id=generate_evidence_id(
+                         EvidenceSource.PYPI, kind, locator, str(value), slug_subject
+                     ),
+                     source=EvidenceSource.PYPI,
+                     locator=locator,
+                     kind=kind,
+                     value=value,
+                     observed_at=now,
+                     confidence=confidence + 0.1,  # Emails are slightly more reliable
+                     notes=f"Found email for '{slug_subject}' from PyPI '{field_name}' field ({source_note}). Designated as {role_kind.value}.",
+                 )
+                 evidence_list.append(record)
+                 logger.debug(
+                     f"Created {kind.value} evidence for '{email}' from '{field_name}'."
+                 )
+
+         # Attempt to use NER to find multiple entities
+         entities = ner.extract_entities(raw_string)
+         if entities:
+             logger.debug(
+                 f"NER found {len(entities)} entities in PyPI field '{field_name}': {entities}"
+             )
+             for entity_name, _entity_label in entities:
+                 parsed = _parse_contact_string(entity_name)
+                 add_separate_evidence(parsed, "NER", confidence=0.45)
+         # else:
+         #     # Fallback to simple parsing if NER finds nothing
+         #     parsed = _parse_contact_string(raw_string)
+         #     add_separate_evidence(parsed, "regex fallback", confidence=0.30)
+
+     # Process author and maintainer fields
+     author_name = _clean_pypi_field(info.get("author"))
+     author_email = _clean_pypi_field(info.get("author_email"))
+     # Prefer email string as it's more likely to contain both name and email
+     author_string = author_email or author_name
+     if author_string:
+         process_contact_string(
+             author_string, EvidenceKind.AUTHOR_TAG, "author/author_email"
+         )
+
+     maintainer_name = _clean_pypi_field(info.get("maintainer"))
+     maintainer_email = _clean_pypi_field(info.get("maintainer_email"))
+     maintainer_string = maintainer_email or maintainer_name
+
+     # Only process maintainer if it's different from the author string
+     if maintainer_string and maintainer_string != author_string:
+         process_contact_string(
+             maintainer_string, EvidenceKind.MAINTAINER, "maintainer/maintainer_email"
+         )
+
+     # --- Project URL parsing ---
+     project_urls = info.get("project_urls")
+     if isinstance(project_urls, dict):
+         logger.debug(f"Found {len(project_urls)} project URLs to analyze.")
+         for label, url in project_urls.items():
+             if not url or not isinstance(url, str):
+                 continue
+
+             domain_info = tldextract.extract(url)
+             repo_host = domain_info.domain
+             logger.debug(f"Parsing project URL ({label}): {url}")
+
+             if repo_host in ("github", "gitlab", "codeberg"):
+                 path_parts = url.strip("/").split("/")
+                 if len(path_parts) >= 4:
+                     org_or_user = path_parts[3]
+                     logger.debug(f"Extracted user/org '{org_or_user}' from URL.")
+                     value = {"name": org_or_user, "url": url}
+                     notes = f"Found user/org '{org_or_user}' from repository URL in project_urls."
+                     record = EvidenceRecord(
+                         id=generate_evidence_id(
+                             EvidenceSource.PYPI,
+                             EvidenceKind.ORGANIZATION,
+                             locator,
+                             str(value),
+                             org_or_user,
+                             hint=f"{repo_host}-user",
+                         ),
+                         source=EvidenceSource.PYPI,
+                         locator=locator,
+                         kind=EvidenceKind.ORGANIZATION,
+                         value=value,
+                         observed_at=now,
+                         confidence=0.35,
+                         notes=notes,
+                     )
+                     already_in = False
+                     for already in evidence_list:
+                         if already.notes == notes:
+                             already_in = True
+                     if not already_in:
+                         evidence_list.append(record)
+
+     logger.info(
+         f"Extracted {len(evidence_list)} evidence records and {len(maintainer_list)} maintainers from PyPI."
+     )
+     return evidence_list, maintainer_list
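
A minimal usage sketch for the evidence module above (not part of the package): it assumes skip-trace and its dependencies (tldextract, spaCy with the en_core_web_sm model) are installed, and the metadata dict is an illustrative fragment of the PyPI JSON API response shape that extract_from_pypi reads. Note that name/email evidence depends on NER finding entities (the regex fallback is commented out), while the repository URL yields an organization record.

# example_evidence.py (illustrative only)
from skip_trace.analysis.evidence import extract_from_pypi

metadata = {
    "info": {
        "name": "example-pkg",
        "version": "1.0.0",
        "author_email": "Jane Doe <jane@example.com>",
        "project_urls": {"Source": "https://github.com/janedoe/example-pkg"},
    }
}

# Returns (evidence records, maintainers parsed from the author/maintainer fields).
evidence, maintainers = extract_from_pypi(metadata)
for record in evidence:
    # IDs follow the documented format: e-<source>-<kind>-<slug>[--<hint>]~<hash8>
    print(record.id, record.confidence)
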
@@ -0,0 +1,58 @@
+ # skip_trace/analysis/ner.py
+ from __future__ import annotations
+
+ import logging
+ from typing import List, Optional, Tuple
+
+ import spacy
+ from spacy.language import Language
+
+ SPACY_AVAILABLE = True
+
+
+ logger = logging.getLogger(__name__)
+
+ _nlp: Optional[Language] = None
+
+
+ def _get_nlp_model() -> Optional[Language]:
+     """Loads and caches the spaCy model. Returns None if unavailable."""
+     global _nlp
+     if not SPACY_AVAILABLE:
+         return None
+     if _nlp is None:
+         try:
+             logger.debug("Loading spaCy model 'en_core_web_sm'...")
+             _nlp = spacy.load("en_core_web_sm")
+             logger.info("Successfully loaded spaCy NER model.")
+         except IOError:
+             logger.warning(
+                 "spaCy is installed, but model 'en_core_web_sm' not found. "
+                 "Run 'python -m spacy download en_core_web_sm' to install it."
+             )
+             return None
+     return _nlp
+
+
+ def extract_entities(text: str) -> List[Tuple[str, str]]:
+     """
+     Extracts person and organization entities from a string using spaCy.
+
+     Args:
+         text: The text to process.
+
+     Returns:
+         A list of tuples, where each tuple is (entity_text, entity_label).
+         Returns an empty list if spaCy is not available or fails.
+     """
+     nlp = _get_nlp_model()
+     if not nlp:
+         return []
+
+     doc = nlp(text)
+     entities = []
+     for ent in doc.ents:
+         if ent.label_ in ["PERSON", "ORG"]:
+             entities.append((ent.text.strip(), ent.label_))
+             logger.debug(f"NER found entity: '{ent.text}' (Label: {ent.label_})")
+     return entities
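
A short usage sketch for the NER helper above, assuming spaCy and the en_core_web_sm model are installed (the download command is taken from the module's own warning message); the sample sentence and its output are illustrative and depend on the model.

# example_ner.py (illustrative only)
from skip_trace.analysis.ner import extract_entities

# One-time model install: python -m spacy download en_core_web_sm
entities = extract_entities("Maintained by Jane Doe at Example Corp")
for text, label in entities:
    # Typically yields entries like ("Jane Doe", "PERSON") and ("Example Corp", "ORG")
    print(label, text)
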
@@ -0,0 +1,282 @@
+ # skip_trace/analysis/scoring.py
+ from __future__ import annotations
+
+ import collections
+ import logging
+ from typing import Dict, List, Optional, Tuple
+
+ import tldextract
+
+ from ..analysis.evidence import _parse_contact_string  # Import the parser for reuse
+ from ..config import CONFIG
+ from ..schemas import (
+     Contact,
+     ContactType,
+     EvidenceKind,
+     EvidenceRecord,
+     OwnerCandidate,
+     OwnerKind,
+ )
+
+ # Words that indicate a regex grabbed junk from a license instead of a name.
+ JUNK_WORDS = {
+     "copyright",
+     "holders",
+     "license",
+     "document",
+     "accompanies",
+     "identifies",
+     "endorse",
+     "promote",
+     "software",
+     "permission",
+     "danger",
+     "warranty",
+     "bsd",
+     "liability",
+     # duped
+     "notice",
+     "authors",
+     "conditions",
+     # stop words
+     "and",
+     "other",
+     "the",
+     "for",
+     "with",
+     "this",
+     "list",
+     "following",
+     "txt",
+     "damages",
+     "owner",
+     # legalese
+     "incidental",
+     "holder",
+     # license names (lowercase so the check against raw_holder.lower() matches)
+     "mit",
+     "bsd",
+ }
+
+ logger = logging.getLogger(__name__)
+
+
+ def _normalize_name(name: str) -> str:
+     """Normalizes a name for entity grouping."""
+     # Also parse out emails that might be part of the name
+     parsed = _parse_contact_string(name)
+     raw_name = parsed.get("name") or parsed.get("email") or name
+     # Strip common trailing punctuation for better grouping
+     return raw_name.strip().rstrip(",.'").lower()
+
+
+ def _get_entity_from_record(record: EvidenceRecord) -> Tuple[Optional[str], OwnerKind]:
+     """Extracts a primary entity name and kind from an evidence record."""
+     kind = OwnerKind.INDIVIDUAL  # Default
+     name = None
+
+     if record.kind in (
+         EvidenceKind.MAINTAINER,
+         EvidenceKind.AUTHOR_TAG,
+         EvidenceKind.COMMIT_AUTHOR,
+         EvidenceKind.PYPI_USER,
+         EvidenceKind.USER_PROFILE,
+         EvidenceKind.CONTACT,  # Handle generic contacts
+     ):
+         raw_name = record.value.get("name") or record.value.get("email")
+         if raw_name:
+             # The name might be "Name <email>", so parse it
+             parsed = _parse_contact_string(raw_name)
+             name = parsed.get("name") or parsed.get("email")
+             kind = OwnerKind.INDIVIDUAL
+     elif record.kind in (EvidenceKind.ORGANIZATION, EvidenceKind.REPO_OWNER):
+         name = record.value.get("name")
+         # Check if the name looks like a user or an org
+         # A simple heuristic: if it contains spaces, it's likely a person's name
+         if name and " " in name:
+             kind = OwnerKind.INDIVIDUAL
+         else:
+             kind = OwnerKind.PROJECT
+     # Handle user profile and company evidence
+     elif record.kind == EvidenceKind.USER_PROFILE:
+         name = record.value.get("user_name")
+         kind = OwnerKind.INDIVIDUAL
+     elif record.kind == EvidenceKind.USER_COMPANY:
+         # The primary entity is the user, but this record also implies a company
+         name = record.value.get("user_name")
+         kind = OwnerKind.INDIVIDUAL
+     elif record.kind == EvidenceKind.PROJECT_URL:
+         url = record.value.get("url", "")
+         domain_info = tldextract.extract(url)
+         if domain_info.domain and domain_info.suffix:
+             name = domain_info.domain.capitalize()
+             kind = OwnerKind.COMPANY
+     # Handle WHOIS domain evidence
+     elif record.kind == EvidenceKind.DOMAIN:
+         name = record.value.get("name")
+         kind = OwnerKind.COMPANY
+     # Handle COPYRIGHT evidence from file scans
+     elif record.kind == EvidenceKind.COPYRIGHT:
+         # The scanner is now responsible for pre-filtering junk.
+         # This logic can now trust its input more.
+         raw_holder = record.value.get("holder")
+         if not raw_holder:
+             return None, kind
+
+         # --- NEW: Sanitize the raw string before accepting it as a name ---
+         # 1. Reject if it's too long to be a name.
+         if len(raw_holder) > 50:
+             return None, kind
+         # 2. Reject if it contains common license garbage words.
+         if any(word in raw_holder.lower() for word in JUNK_WORDS):
+             return None, kind
+
+         parsed = _parse_contact_string(raw_holder)
+         name = parsed.get("name") or parsed.get("email") or raw_holder
+         if parsed.get("email") or " " in name or "," in name:
+             kind = OwnerKind.INDIVIDUAL
+         else:
+             kind = OwnerKind.COMPANY
+
+     return name, kind
+
+
+ def score_owners(evidence_records: List[EvidenceRecord]) -> List[OwnerCandidate]:
+     """
+     Scores and ranks potential owners from a list of evidence.
+
+     This function performs entity resolution by normalizing names, aggregates
+     evidence for each unique entity, and calculates a score based on the
+     weights defined in the application configuration.
+
+     Args:
+         evidence_records: A list of EvidenceRecord objects to analyze.
+
+     Returns:
+         A list of OwnerCandidate objects, sorted by score in descending order.
+     """
+     # Get suppression settings from config
+     suppressed_orgs = CONFIG.get("suppressed_tool_orgs", [])
+     lenient_mode = CONFIG.get("lenient_mode_enabled", False)
+
+     # --- 1. Initial Entity Extraction & Alias Mapping ---
+     entities: Dict[str, OwnerCandidate] = {}
+     evidence_by_entity: Dict[str, List[EvidenceRecord]] = collections.defaultdict(list)
+
+     # 1. First pass: extract all entities and map evidence
+     for record in evidence_records:
+         name, kind = _get_entity_from_record(record)
+         # if not name:
+         #     logging.warning(f"Skipping {record.kind}")
+         #     continue
+
+         if not name:
+             name = ""
+
+         # Suppress tool orgs like 'github' unless in lenient mode
+         if name and (name.lower() in suppressed_orgs) and not lenient_mode:
+             continue
+
+         norm_name = _normalize_name(name)
+         evidence_by_entity[norm_name].append(record)
+         if norm_name not in entities:
+             # Use the raw name for display, but the normalized name for grouping
+             entities[norm_name] = OwnerCandidate(
+                 name=name.strip().rstrip(",.'"), kind=kind
+             )
+
+         # Also create entities for companies mentioned in user profiles
+         if record.kind == EvidenceKind.USER_COMPANY:
+             company_name = record.value.get("company_name")
+             if company_name:
+                 norm_co_name = _normalize_name(company_name)
+                 if norm_co_name not in entities:
+                     entities[norm_co_name] = OwnerCandidate(
+                         name=company_name, kind=OwnerKind.COMPANY
+                     )
+                 evidence_by_entity[norm_co_name].append(
+                     record
+                 )  # Associate this evidence with the company too
+
+     # --- 2. Score each candidate and collect contacts ---
+     contact_map = {
+         "email": ContactType.EMAIL,
+         "twitter": ContactType.TWITTER,
+         "linkedin": ContactType.LINKEDIN,
+         "mastodon": ContactType.MASTODON,
+         "facebook": ContactType.FACEBOOK,
+         "instagram": ContactType.INSTAGRAM,
+         "youtube": ContactType.YOUTUBE,
+         "tiktok": ContactType.TIKTOK,
+     }
+     for norm_name, owner in entities.items():
+         score = 0.0
+         seen_rationale_keys = set()
+         contacts: Dict[Tuple[ContactType, str], Contact] = {}
+
+         for record in evidence_by_entity[norm_name]:
+             owner.evidence.append(record.id)
+             rationale_key = f"{record.source.value}-{record.kind.value}"
+             weight = record.confidence  # Use confidence from collector
+
+             if rationale_key not in seen_rationale_keys:
+                 score += weight
+                 seen_rationale_keys.add(rationale_key)
+             else:
+                 score += weight * 0.1  # Diminishing return
+
+             # --- UPDATED: Collect contact info from all relevant evidence kinds ---
+             contact_source_string = None
+             if record.kind in (
+                 EvidenceKind.MAINTAINER,
+                 EvidenceKind.AUTHOR_TAG,
+                 EvidenceKind.COMMIT_AUTHOR,
+                 EvidenceKind.CONTACT,
+             ):
+                 contact_source_string = record.value.get("email") or record.value.get(
+                     "name"
+                 )
+             elif record.kind in (EvidenceKind.ORGANIZATION, EvidenceKind.REPO_OWNER):
+                 if url := record.value.get("url", record.locator):
+                     contacts[(ContactType.REPO, url)] = Contact(
+                         type=ContactType.REPO, value=url
+                     )
+             elif record.kind == EvidenceKind.PYPI_USER:
+                 if url := record.value.get("url"):
+                     contacts[(ContactType.URL, url)] = Contact(
+                         type=ContactType.URL, value=url
+                     )
+             # Collect contacts from USER_PROFILE evidence
+             elif record.kind == EvidenceKind.USER_PROFILE:
+                 for key, value in record.value.get("contacts", {}).items():
+                     contact_type = contact_map.get(
+                         key, ContactType.URL
+                     )  # Default to generic URL
+                     contacts[(contact_type, value)] = Contact(
+                         type=contact_type, value=value
+                     )
+             elif record.kind == EvidenceKind.COPYRIGHT:
+                 contact_source_string = record.value.get("holder")
+
+             # Parse any found string for an email
+             if contact_source_string:
+                 parsed_contact = _parse_contact_string(contact_source_string)
+                 if email := parsed_contact.get("email"):
+                     contacts[(ContactType.EMAIL, email)] = Contact(
+                         type=ContactType.EMAIL, value=email
+                     )
+
+         owner.score = min(round(score, 2), 1.0)
+         owner.evidence = sorted(list(set(owner.evidence)))
+         owner.rationale = " + ".join(sorted(list(seen_rationale_keys)))
+         owner.contacts = sorted(
+             list(contacts.values()), key=lambda c: (c.type.value, c.value)
+         )
+
+     # 4. Filter and Sort
+     # filtered_candidates = [
+     #     c for c in entities.values()
+     #     if not (c.name.lower() in ["nobody", "nobody in particular", "example"] and c.score < 0.3)
+     # ]
+
+     return sorted(entities.values(), key=lambda c: c.score, reverse=True)
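
A minimal end-to-end sketch tying the modules above together (not part of the package): it assumes the package's CONFIG supplies defaults for suppressed_tool_orgs and lenient_mode_enabled, and it is a hand-wired illustration rather than the package's own CLI flow (which lives in skip_trace.cli per __main__.py). The metadata dict reuses the illustrative shape from the evidence sketch.

# example_scoring.py (illustrative only)
from skip_trace.analysis import evidence, scoring

metadata = {
    "info": {
        "name": "example-pkg",
        "version": "1.0.0",
        "author_email": "Jane Doe <jane@example.com>",
        "project_urls": {"Source": "https://github.com/janedoe/example-pkg"},
    }
}

records, _maintainers = evidence.extract_from_pypi(metadata)
candidates = scoring.score_owners(records)
for owner in candidates:
    # OwnerCandidate exposes name, kind, score, rationale, contacts, evidence
    print(owner.name, owner.kind, owner.score, owner.rationale)
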