skip_trace-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skip_trace/__about__.py +19 -0
- skip_trace/__init__.py +6 -0
- skip_trace/__main__.py +9 -0
- skip_trace/analysis/__init__.py +4 -0
- skip_trace/analysis/evidence.py +312 -0
- skip_trace/analysis/ner.py +58 -0
- skip_trace/analysis/scoring.py +282 -0
- skip_trace/analysis/source_scanner.py +411 -0
- skip_trace/cli.py +177 -0
- skip_trace/collectors/__init__.py +4 -0
- skip_trace/collectors/github.py +241 -0
- skip_trace/collectors/package_files.py +150 -0
- skip_trace/collectors/pypi.py +158 -0
- skip_trace/collectors/whois.py +202 -0
- skip_trace/config.py +165 -0
- skip_trace/exceptions.py +22 -0
- skip_trace/main.py +269 -0
- skip_trace/py.typed.py +0 -0
- skip_trace/reporting/__init__.py +0 -0
- skip_trace/reporting/json_reporter.py +22 -0
- skip_trace/reporting/md_reporter.py +115 -0
- skip_trace/schemas.py +131 -0
- skip_trace/utils/__init__.py +4 -0
- skip_trace/utils/cache.py +77 -0
- skip_trace/utils/cli_suggestions.py +91 -0
- skip_trace/utils/http_client.py +45 -0
- skip_trace/utils/safe_targz.py +161 -0
- skip_trace/utils/validation.py +52 -0
- skip_trace-0.1.0.dist-info/METADATA +125 -0
- skip_trace-0.1.0.dist-info/RECORD +33 -0
- skip_trace-0.1.0.dist-info/WHEEL +4 -0
- skip_trace-0.1.0.dist-info/entry_points.txt +2 -0
- skip_trace-0.1.0.dist-info/licenses/LICENSE +21 -0
skip_trace/reporting/md_reporter.py
ADDED
@@ -0,0 +1,115 @@
# skip_trace/reporting/md_reporter.py
from __future__ import annotations

import sys
from typing import IO

from rich.console import Console
from rich.table import Table

from ..schemas import PackageResult


def render(result: PackageResult, file: IO[str] = sys.stdout):
    """
    Renders the PackageResult as a rich report to the console.

    Args:
        result: The PackageResult object to render.
        file: The file object to write to (defaults to stdout).
    """
    import shutil

    width, _ = shutil.get_terminal_size((80, 175))
    console = Console(file=file, width=width)
    version_str = f" v{result.version}" if result.version else ""
    console.print(
        f"\n[bold]📦 skip-trace: Ownership Report for {result.package}{version_str}[/bold]"
    )
    console.print("-" * 80)

    # --- OWNERS TABLE ---
    if not result.owners:
        console.print("\n[bold]## 🕵️ Owner Candidates[/bold]")
        console.print("\nNo owner candidates found.\n")
    else:
        console.print("\n[bold]## 🕵️ Owner Candidates[/bold]")
        # Create a lookup map for evidence for fast access
        evidence_map = {ev.id: ev for ev in result.evidence}

        owner_table = Table(
            show_header=True,
            header_style="bold magenta",
            title="Top Owner Candidates",
            title_style="bold",
        )
        owner_table.add_column("Owner", style="cyan", width=30, no_wrap=True)
        owner_table.add_column("Kind", width=10)
        owner_table.add_column("Score", justify="right", style="bold")
        owner_table.add_column("Contacts", width=65)
        owner_table.add_column("Key Evidence Notes", no_wrap=False)

        for owner in result.owners:
            score_str = f"{owner.score:.2f}"
            score_style = (
                "green"
                if owner.score >= 0.7
                else "yellow" if owner.score >= 0.5 else "red"
            )

            contact_parts = []
            for contact in owner.contacts:
                value = contact.value
                if contact.type.value in ("url", "repo") and len(value) > 60:
                    value = value[:60] + "..."
                contact_parts.append(f"[bold dim]{contact.type.value}[/]: {value}")
            contacts_str = (
                "\n".join(contact_parts)
                if contact_parts
                else "[italic]None found[/italic]"
            )

            # Look up the notes from the evidence IDs
            evidence_notes = []
            for ev_id in owner.evidence:
                evidence_record = evidence_map.get(ev_id)
                if evidence_record and evidence_record.notes:
                    evidence_notes.append(f"• {evidence_record.notes}")
            key_evidence_str = (
                "\n".join(evidence_notes)
                if evidence_notes
                else "[italic]No notes.[/italic]"
            )

            owner_table.add_row(
                owner.name,
                owner.kind.value,
                f"[{score_style}]{score_str}[/]",
                contacts_str,
                key_evidence_str,
            )
        console.print(owner_table)

    # --- MAINTAINERS TABLE ---
    if result.maintainers:
        console.print("\n[bold]## 🧑‍💻 PyPI Maintainers[/bold]")
        maintainer_table = Table(
            show_header=True,
            header_style="bold cyan",
            title="Directly Listed Maintainers",
            title_style="bold",
        )
        maintainer_table.add_column("Name", style="cyan")
        maintainer_table.add_column("Email")
        maintainer_table.add_column("Confidence", justify="right")

        for maintainer in sorted(
            result.maintainers, key=lambda m: m.confidence, reverse=True
        ):
            email_str = maintainer.email or "[italic]Not provided[/italic]"
            confidence_str = f"{maintainer.confidence:.2f}"
            maintainer_table.add_row(maintainer.name, email_str, confidence_str)

        console.print(maintainer_table)

    console.print("-" * 80)
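For quick inspection, the reporter can be pointed at any writable text stream rather than stdout. A minimal sketch, assuming only the `render` signature above; the package name is invented for illustration:

```python
import io

from skip_trace.reporting.md_reporter import render
from skip_trace.schemas import PackageResult

# An empty result still renders the report header and the
# "No owner candidates found." section.
result = PackageResult(package="example-pkg", version="0.1.0")

buffer = io.StringIO()  # capture the report instead of printing it
render(result, file=buffer)
print(buffer.getvalue())
```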
skip_trace/schemas.py
ADDED
@@ -0,0 +1,131 @@
# skip_trace/schemas.py
from __future__ import annotations

import datetime
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, List, Optional

# --- Enums for controlled vocabularies ---


class OwnerKind(str, Enum):
    INDIVIDUAL = "individual"
    COMPANY = "company"
    FOUNDATION = "foundation"
    PROJECT = "project"


class ContactType(str, Enum):
    EMAIL = "email"
    URL = "url"
    SECURITY = "security"
    REPO = "repo"
    MATRIX = "matrix"
    SLACK = "slack"
    TWITTER = "twitter"
    MASTODON = "mastodon"
    LINKEDIN = "linkedin"
    # Add more common social platforms
    FACEBOOK = "facebook"
    INSTAGRAM = "instagram"
    YOUTUBE = "youtube"
    TIKTOK = "tiktok"
    OTHER = "other"


class EvidenceSource(str, Enum):
    PYPI = "pypi"
    REPO = "repo"
    WHEEL = "wheel"
    LOCAL = "local"
    DOCS = "docs"
    SIGSTORE = "sigstore"
    WHOIS = "whois"
    VENV_SCAN = "venv-scan"
    LLM_NER = "llm-ner"


class EvidenceKind(str, Enum):
    PERSON = "person"
    EMAIL = "email"
    MAINTAINER = "maintainer"
    ORGANIZATION = "org"
    DOMAIN = "domain"
    GOVERNANCE = "governance"
    SIGNATURE = "signature"
    COPYRIGHT = "copyright"
    AUTHOR_TAG = "author-tag"
    CODEOWNERS = "codeowners"
    CONTACT = "contact"
    PROJECT_URL = "project-url"
    PYPI_USER = "pypi-user"
    # GitHub-specific evidence kinds
    REPO_OWNER = "repo-owner"
    COMMIT_AUTHOR = "commit-author"
    # GitHub profile-specific evidence kinds
    USER_PROFILE = "user-profile"
    USER_COMPANY = "user-company"


# --- Core Data Schemas ---


@dataclass
class Contact:
    """Represents a method of contacting an entity."""

    type: ContactType
    value: str
    verified: bool = False


@dataclass
class EvidenceRecord:
    """A single piece of evidence supporting an ownership claim."""

    id: str
    source: EvidenceSource
    locator: str  # URL, file path, or PURL
    kind: EvidenceKind
    value: Any
    observed_at: datetime.datetime
    linkage: List[str] = field(default_factory=list)
    confidence: float = 0.0
    notes: str = ""


@dataclass
class OwnerCandidate:
    """Represents a potential owner with an aggregated score."""

    name: str
    kind: OwnerKind
    score: float = 0.0
    contacts: List[Contact] = field(default_factory=list)
    evidence: List[str] = field(default_factory=list)  # List of EvidenceRecord IDs
    rationale: str = ""


@dataclass
class Maintainer:
    """A simplified maintainer record, distinct from a scored owner."""

    name: str
    email: Optional[str] = None
    confidence: float = 0.0


@dataclass
class PackageResult:
    """The final JSON output for a single package."""

    package: str
    version: Optional[str] = None
    owners: List[OwnerCandidate] = field(default_factory=list)
    maintainers: List[Maintainer] = field(default_factory=list)
    evidence: List[EvidenceRecord] = field(default_factory=list)
    timestamp: str = field(
        default_factory=lambda: datetime.datetime.now(datetime.timezone.utc).isoformat()
    )
    schema_version: str = "1.0"
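A sketch of how the schema pieces fit together: owner candidates reference evidence records by ID rather than embedding them, which is exactly what the reporter's `evidence_map` lookup above relies on. All names and values here are invented for illustration:

```python
import datetime

from skip_trace.schemas import (
    Contact,
    ContactType,
    EvidenceKind,
    EvidenceRecord,
    EvidenceSource,
    OwnerCandidate,
    OwnerKind,
    PackageResult,
)

ev = EvidenceRecord(
    id="pypi-maintainer-1",
    source=EvidenceSource.PYPI,
    locator="https://pypi.org/project/example-pkg/",
    kind=EvidenceKind.MAINTAINER,
    value="Jane Doe",
    observed_at=datetime.datetime.now(datetime.timezone.utc),
    confidence=0.8,
    notes="Listed as maintainer on PyPI",
)
owner = OwnerCandidate(
    name="Jane Doe",
    kind=OwnerKind.INDIVIDUAL,
    score=0.8,
    contacts=[Contact(type=ContactType.EMAIL, value="jane@janedoe.dev")],
    evidence=[ev.id],  # linked by ID; the reporter resolves IDs via its map
)
result = PackageResult(package="example-pkg", owners=[owner], evidence=[ev])
```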
skip_trace/utils/cache.py
ADDED
@@ -0,0 +1,77 @@
# skip_trace/utils/cache.py
from __future__ import annotations

import json
import logging
import os
import time
from typing import Any, Optional

from ..config import CONFIG

logger = logging.getLogger(__name__)


def get_cache_path(cache_type: str, key: str) -> str:
    """Constructs the full path for a given cache type and key."""
    cache_config = CONFIG.get("cache", {})
    base_dir = cache_config.get("dir", ".skip_trace_cache")
    cache_dir = os.path.join(base_dir, cache_type)
    os.makedirs(cache_dir, exist_ok=True)

    # Sanitize key for filesystem compatibility
    safe_key = "".join(c for c in key if c.isalnum() or c in ("-", "_", "."))
    return os.path.join(cache_dir, f"{safe_key}.json")


def get_cached_data(cache_type: str, key: str) -> Optional[Any]:
    """
    Retrieves data from the cache if it exists and is not expired.

    Args:
        cache_type: The category of the cache (e.g., 'whois').
        key: The unique identifier for the cached item.

    Returns:
        The cached data, or None if not found or expired.
    """
    cache_config = CONFIG.get("cache", {})
    if not cache_config.get("enabled", True):
        return None

    file_path = get_cache_path(cache_type, key)
    ttl = cache_config.get("ttl_seconds", 604800)  # Default to 7 days

    if os.path.exists(file_path):
        mod_time = os.path.getmtime(file_path)
        if (time.time() - mod_time) < ttl:
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    return json.load(f)
            except (json.JSONDecodeError, IOError) as e:
                logger.warning(f"Could not read cache file {file_path}: {e}")
                return None
    return None


def set_cached_data(cache_type: str, key: str, data: Any):
    """
    Writes data to the cache.

    Args:
        cache_type: The category of the cache (e.g., 'whois').
        key: The unique identifier for the item to cache.
        data: The JSON-serializable data to store.
    """
    if not data:
        return
    cache_config = CONFIG.get("cache", {})
    if not cache_config.get("enabled", True):
        return

    file_path = get_cache_path(cache_type, key)
    try:
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, default=str)
    except IOError as e:
        logger.error(f"Could not write to cache file {file_path}: {e}")
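A minimal round-trip sketch, assuming the default configuration (cache enabled, 7-day TTL); the cache type and key mirror the 'whois' example from the docstrings, and the record contents are made up:

```python
from skip_trace.utils import cache

record = {"domain": "example.org", "registrar": "Example Registrar, Inc."}
cache.set_cached_data("whois", "example.org", record)

# Served from disk until the configured ttl_seconds (default 604800) elapses.
assert cache.get_cached_data("whois", "example.org") == record
```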
skip_trace/utils/cli_suggestions.py
ADDED
@@ -0,0 +1,91 @@
"""
Smart argument parser with typo suggestions.

This module provides a subclass of `argparse.ArgumentParser` that enhances
the error reporting behavior when users supply invalid choices. If a user
makes a typo in a choice, the parser will suggest the closest matches
based on string similarity.

Example:
    ```python
    import sys

    parser = SmartParser(prog="myapp")
    parser.add_argument("color", choices=["red", "green", "blue"])
    args = parser.parse_args()

    # If the user runs:
    #   myapp gren
    #
    # The output will include:
    #   error: invalid choice: 'gren' (choose from 'red', 'green', 'blue')
    #
    #   Did you mean: green?
    ```
"""

from __future__ import annotations

import argparse
import sys
from difflib import get_close_matches


class SmartParser(argparse.ArgumentParser):
    """Argument parser that suggests similar choices on invalid input.

    This class extends `argparse.ArgumentParser` to provide more helpful
    error messages when the user provides an invalid choice for an argument.
    Instead of only showing the list of valid choices, it also suggests the
    closest matches using fuzzy string matching.

    Example:
        ```python
        parser = SmartParser()
        parser.add_argument("fruit", choices=["apple", "banana", "cherry"])
        args = parser.parse_args()
        ```

    If the user types:
        ```
        myprog bannna
        ```

    The error message will include:
        ```
        Did you mean: banana?
        ```
    """

    def error(self, message: str):
        """Handle parsing errors with suggestions for invalid choices.

        Args:
            message (str): The error message generated by argparse,
                typically when parsing fails (e.g., due to invalid
                choices or syntax errors).

        Side Effects:
            - Prints usage information to `sys.stderr`.
            - Exits the program with status code 2.

        Behavior:
            - If the error message contains an "invalid choice" message,
              attempts to suggest the closest valid alternatives by
              computing string similarity.
            - Otherwise, preserves standard argparse behavior.
        """
        # Detect "invalid choice: 'foo' (choose from ...)"
        if "invalid choice" in message and "choose from" in message:
            bad = message.split("invalid choice:")[1].split("(")[0].strip().strip("'\"")
            choices_str = message.split("choose from")[1]
            choices = [
                c.strip().strip(",)'") for c in choices_str.split() if c.strip(",)")
            ]

            tips = get_close_matches(bad, choices, n=3, cutoff=0.6)
            if tips:
                message += f"\n\nDid you mean: {', '.join(tips)}?"

        self.print_usage(sys.stderr)
        self.exit(2, f"{self.prog}: error: {message}\n")
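The suggestion step can be traced in isolation. This sketch replays the parsing that `error()` performs on an argparse-style message; the message text here mimics argparse's format and is not produced by a real parser:

```python
from difflib import get_close_matches

message = "argument color: invalid choice: 'gren' (choose from 'red', 'green', 'blue')"

# Same extraction as SmartParser.error(): pull out the bad token...
bad = message.split("invalid choice:")[1].split("(")[0].strip().strip("'\"")
# ...and the list of valid choices.
choices_str = message.split("choose from")[1]
choices = [c.strip().strip(",)'") for c in choices_str.split() if c.strip(",)")]

print(get_close_matches(bad, choices, n=3, cutoff=0.6))  # ['green']
```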
skip_trace/utils/http_client.py
ADDED
@@ -0,0 +1,45 @@
# skip_trace/utils/http_client.py
from __future__ import annotations

from typing import Optional

import httpx

from ..config import CONFIG
from ..exceptions import NetworkError

_client: Optional[httpx.Client] = None


def get_client() -> httpx.Client:
    """Returns a shared httpx.Client instance."""
    global _client
    if _client is None:
        http_config = CONFIG.get("http", {})
        _client = httpx.Client(
            headers={"User-Agent": http_config.get("user_agent", "skip-trace")},
            timeout=http_config.get("timeout", 5),
            follow_redirects=True,
        )
    return _client


def make_request(url: str) -> httpx.Response:
    """
    Makes a GET request using the shared client and handles common errors.

    :param url: The URL to fetch.
    :raises NetworkError: If the request fails due to network issues or an error status code.
    :return: The httpx.Response object.
    """
    client = get_client()
    try:
        response = client.get(url)
        response.raise_for_status()
        return response
    except httpx.RequestError as e:
        raise NetworkError(f"Network request to {e.request.url} failed: {e}") from e
    except httpx.HTTPStatusError as e:
        raise NetworkError(
            f"Request to {e.request.url} failed with status {e.response.status_code}"
        ) from e
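A minimal usage sketch; the URL points at PyPI's public JSON API, and the single `except` shows that `NetworkError` wraps both transport failures and non-2xx responses:

```python
from skip_trace.exceptions import NetworkError
from skip_trace.utils.http_client import make_request

try:
    response = make_request("https://pypi.org/pypi/requests/json")
    print(response.json()["info"]["name"])  # "requests"
except NetworkError as e:
    print(f"lookup failed: {e}")
```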
skip_trace/utils/safe_targz.py
ADDED
@@ -0,0 +1,161 @@
# safe_targz.py
from __future__ import annotations

import os
import tarfile
from pathlib import Path, PurePosixPath
from posixpath import normpath as posix_normpath
from tarfile import TarFile, TarInfo
from typing import Iterable, List, Tuple


class TarExtractionError(Exception):
    pass


def _is_within(base: Path, target: Path) -> bool:
    try:
        base_resolved = base.resolve(strict=False)
        target_resolved = target.resolve(strict=False)
    except Exception:
        # Fall back if resolve fails on not-yet-created parents
        base_resolved = base.absolute()
        target_resolved = target.absolute()
    try:
        target_resolved.relative_to(base_resolved)
        return True
    except Exception:
        return False


def _sanitize_member_name(name: str) -> str:
    # Tar paths are POSIX; normalize and strip leading "./"
    name = name.lstrip("./")
    name = posix_normpath(name)
    return name


def _is_bad_path(name: str) -> bool:
    # Reject absolute paths, Windows drive letters, parent traversal
    if not name or name == ".":
        return True
    if name.startswith("/") or name.startswith("\\"):
        return True
    if ":" in name.split("/")[0]:  # e.g., "C:..." in archives created on Windows
        return True
    parts = PurePosixPath(name).parts
    return any(p == ".." for p in parts)


def _iter_safe_members(
    tf: TarFile, dest: Path, allow_symlinks: bool
) -> Iterable[Tuple[TarInfo, Path]]:
    for m in tf.getmembers():
        clean = _sanitize_member_name(m.name)
        if _is_bad_path(clean):
            continue
        out_path = dest / Path(*PurePosixPath(clean).parts)
        if not _is_within(dest, out_path):
            continue

        # Directories and regular files are allowed
        if m.isdir():
            yield (m, out_path)
        elif m.isreg():
            yield (m, out_path)
        elif m.issym() or m.islnk():
            if not allow_symlinks:
                continue
            # Only allow relative symlink targets that stay inside dest
            link = m.linkname or ""
            link = _sanitize_member_name(link)
            if _is_bad_path(link):
                continue
            # Compute where the symlink would point to
            # (symlink is created relative to out_path.parent)
            target = out_path.parent / Path(*PurePosixPath(link).parts)
            if not _is_within(dest, target):
                continue
            yield (m, out_path)
        else:
            # Block devices, fifos, sockets, etc.
            continue


def _extract_member(tf: TarFile, m: TarInfo, out_path: Path) -> None:
    if m.isdir():
        out_path.mkdir(parents=True, exist_ok=True)
        return

    out_path.parent.mkdir(parents=True, exist_ok=True)

    if m.isreg():
        src = tf.extractfile(m)
        if src is None:
            raise TarExtractionError(f"Missing file data for {m.name!r}")
        with src, open(out_path, "wb") as f:
            # Stream copy without trusting metadata
            for chunk in iter(lambda: src.read(1024 * 1024), b""):
                f.write(chunk)
        # Apply conservative mode for regular files (rw-r--r--)
        try:
            os.chmod(out_path, 0o644)
        except Exception:
            pass  # nosec # noqa
        return

    if m.issym() or m.islnk():
        # Create a symlink with relative target; errors are non-fatal
        try:
            if out_path.exists() or out_path.is_symlink():
                out_path.unlink()
            os.symlink(m.linkname, out_path)
        except Exception:
            # If symlink creation is not permitted (e.g., Windows), skip
            pass  # nosec # noqa
        return


def safe_extract_tar(
    archive: Path, dest: Path, allow_symlinks: bool = False
) -> List[Path]:
    """
    Safely extract a tar archive into 'dest'.
    - Rejects absolute/parent-traversal paths and special members by default.
    - Never calls TarFile.extractall() (satisfies Bandit B202).
    - Returns list of extracted filesystem paths.
    """
    archive = Path(archive)
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    mode = "r"
    if archive.suffixes[-2:] == [".tar", ".gz"] or archive.suffix == ".tgz":
        mode = "r:gz"
    elif archive.suffixes[-2:] == [".tar", ".bz2"]:
        mode = "r:bz2"
    elif archive.suffixes[-2:] == [".tar", ".xz"]:
        mode = "r:xz"
    elif archive.suffix == ".tar":
        mode = "r:*"  # auto-detect compression
    else:
        raise TarExtractionError(f"Unsupported tar archive: {archive.name}")

    extracted: List[Path] = []
    with tarfile.open(archive, mode) as tf:  # type: ignore[call-overload]
        for m, out_path in _iter_safe_members(tf, dest, allow_symlinks=allow_symlinks):
            _extract_member(tf, m, out_path)
            extracted.append(out_path)
    return extracted


def safe_extract_auto(
    download_path: str, extract_dir: str, allow_symlinks: bool = False
) -> List[Path]:
    """
    Backwards-compatible replacement for:
        tarfile.open(...).extractall(extract_dir)
    """
    return safe_extract_tar(
        Path(download_path), Path(extract_dir), allow_symlinks=allow_symlinks
    )
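A sketch of the intended drop-in usage, per the `safe_extract_auto` docstring; the paths are placeholders for wherever an sdist was downloaded:

```python
from skip_trace.utils.safe_targz import TarExtractionError, safe_extract_auto

try:
    # Replaces tarfile.open("downloads/pkg-1.0.tar.gz").extractall("work/pkg-1.0"),
    # silently skipping traversal paths, unsafe symlinks, and special members.
    extracted = safe_extract_auto("downloads/pkg-1.0.tar.gz", "work/pkg-1.0")
    print(f"extracted {len(extracted)} members")
except TarExtractionError as e:
    print(f"refused to extract: {e}")
```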
skip_trace/utils/validation.py
ADDED
@@ -0,0 +1,52 @@
# skip_trace/utils/validation.py
from __future__ import annotations

import logging
from typing import Optional

from email_validator import EmailNotValidError, validate_email

logger = logging.getLogger(__name__)

RESERVED_DOMAINS = {
    "example.com",
    "example.net",
    "example.org",
    "localhost",
    "localhost.localdomain",
}

RESERVED_SUFFIXES = {".test", ".example", ".invalid", ".localhost"}


def is_valid_email(email_string: str) -> Optional[str]:
    """
    Checks if a string is a valid email address using a robust library.

    Args:
        email_string: The string to validate.

    Returns:
        The normalized email address if valid, otherwise None.
    """
    if not isinstance(email_string, str):
        return None

    try:
        # We only care about syntactic validity, not whether the domain's
        # mail server is reachable, so we disable deliverability checks.
        valid = validate_email(email_string, check_deliverability=False)

        for reserved in RESERVED_DOMAINS:
            if valid.domain.endswith(reserved):
                return None

        if valid.domain in RESERVED_DOMAINS or any(
            valid.domain.endswith(suffix) for suffix in RESERVED_SUFFIXES
        ):
            return None

        return valid.normalized
    except EmailNotValidError as e:
        logger.debug(f"String '{email_string}' is not a valid email: {e}")
        return None
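A minimal sketch of the filtering behavior; the exact normalized form is whatever `email-validator` produces (it lowercases the domain), so the first output is indicative rather than guaranteed:

```python
from skip_trace.utils.validation import is_valid_email

print(is_valid_email("Jane.Doe@Gmail.com"))   # normalized, e.g. "Jane.Doe@gmail.com"
print(is_valid_email("someone@example.com"))  # None: reserved documentation domain
print(is_valid_email("not-an-email"))         # None: syntactically invalid
```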