skip_trace-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skip_trace/analysis/source_scanner.py ADDED
@@ -0,0 +1,394 @@
+ # skip_trace/analysis/source_scanner.py
+ from __future__ import annotations
+
+ import datetime
+ import logging
+ import os
+ import re
+ import string
+ from typing import List
+
+ from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
+ from ..utils.validation import is_valid_email
+ from . import ner
+ from .evidence import _parse_contact_string, generate_evidence_id
+
+ logger = logging.getLogger(__name__)
+
+ # Regex to find copyright notices, capturing the holder.
+ # Looks for "Copyright", optional (c) symbol, optional year, then the owner.
+ COPYRIGHT_RE = re.compile(
+     r"copyright\s*(?:\(c\))?\s*(?:[0-9,\-\s]+)?\s*([^\n]+)", re.IGNORECASE
+ )
+
+ # Regex to find __author__ assignments
+ AUTHOR_RE = re.compile(r"__author__\s*=\s*['\"]([^'\"]+)['\"]")
+
+ # Regex for finding standalone email addresses - used as a fast pre-filter
+ EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+
+ # Words that indicate a regex grabbed junk from a license instead of a name.
+ # This filter now lives in the scanner, where the bad evidence is generated.
+ JUNK_WORDS = {
+     "copyright",
+     "holders",
+     "license",
+     "document",
+     "accompanies",
+     "notice",
+     "authors",
+     "identifies",
+     "endorse",
+     "promote",
+     "software",
+     "permission",
+     "conditions",
+     # stop words
+     "and",
+     "other",
+     "the",
+     "for",
+     "with",
+     "this",
+     "list",
+     "following",
+     "txt",
+     "damages",
+     "owner",
+     # legalese
+     "incidental",
+     "holder",
+     "liability",
+     # license names
+     "mit",
+     "bsd",
+ }
+
+ # --- NEW: Filename allowlist and more robust binary detection ---
+
+ # A set of common extensionless text files that should never be treated as binary.
+ TEXT_FILENAMES = {
+     "readme",
+     "license",
+     "copying",
+     "notice",
+     "authors",
+     "contributors",
+     "changelog",
+     "history",
+     "install",
+     "makefile",
+     "dockerfile",
+     "vagrantfile",
+ }
+
+
+ def _is_binary_file(filepath: str, chunk_size: int = 1024) -> bool:
+     """
+     Heuristically determines if a file is binary using a multi-step check.
+
+     1. Checks against an allowlist of common text filenames (e.g., 'LICENSE').
+     2. Checks for the presence of NULL bytes.
+     3. Checks the ratio of non-printable text characters.
+
+     Args:
+         filepath: The path to the file to check.
+         chunk_size: The number of bytes to read from the beginning of the file.
+
+     Returns:
+         True if the file is likely binary, False otherwise.
+     """
+     # 1. Check filename allowlist first.
+     basename = os.path.basename(filepath).lower()
+     if basename in TEXT_FILENAMES:
+         return False
+
+     try:
+         with open(filepath, "rb") as f:
+             chunk = f.read(chunk_size)
+     except IOError:
+         return True  # Cannot read, so skip it.
+
+     if not chunk:
+         return False  # Empty file is not binary.
+
+     # 2. A null byte is a strong indicator of a binary file.
+     if b"\0" in chunk:
+         return True
+
+     # 3. Check the ratio of text characters to total characters.
+     # A high percentage of non-printable characters indicates binary data.
+     printable = set(bytes(string.printable, "ascii"))
+     non_printable_count = sum(1 for byte in chunk if byte not in printable)
+
+     # If more than 30% of the characters are non-printable, it's likely binary.
+     ratio = non_printable_count / len(chunk)
+     return ratio > 0.3
+
+
+ def _process_authors_file(
+     content: str, locator: str, now: datetime.datetime
+ ) -> List[EvidenceRecord]:
+     """Processes an AUTHORS file, treating each non-blank line as a potential author."""
+     evidence_list = []
+     logger.debug(f"Processing AUTHORS file at: {locator}")
+     lines = [line.strip() for line in content.splitlines()]
+     for line in lines:
+         if not line or line.startswith("#"):
+             continue
+
+         parsed = _parse_contact_string(line)
+         if not parsed.get("name") and not parsed.get("email"):
+             continue
+
+         value = {"name": parsed["name"], "email": parsed["email"]}
+         name_for_slug = parsed["name"] or parsed["email"] or "unknown"
+
+         record = EvidenceRecord(
+             id=generate_evidence_id(
+                 EvidenceSource.WHEEL,
+                 EvidenceKind.AUTHOR_TAG,
+                 locator,
+                 str(value),
+                 name_for_slug,
+             ),
+             source=EvidenceSource.WHEEL,
+             locator=locator,
+             kind=EvidenceKind.AUTHOR_TAG,
+             value=value,
+             observed_at=now,
+             confidence=0.20,  # Higher confidence than a random email
+             notes=f"Found author '{line}' in AUTHORS file.",
+         )
+         evidence_list.append(record)
+         logger.debug(f"Found author from AUTHORS file: {line}")
+
+     return evidence_list
+
+
+ def scan_directory(directory_path: str, locator_prefix: str) -> List[EvidenceRecord]:
+     """
+     Scans a directory of files for ownership evidence.
+
+     Args:
+         directory_path: The absolute path to the directory to scan.
+         locator_prefix: A prefix for the evidence locator (e.g., package name/version).
+
+     Returns:
+         A list of EvidenceRecord objects found in the files.
+     """
+     evidence_list: List[EvidenceRecord] = []
+     now = datetime.datetime.now(datetime.timezone.utc)
+
+     skip_dirs = {
+         ".git",
+         "__pycache__",
+         ".idea",
+         ".vscode",
+         "dist",
+         "build",
+         ".egg-info",
+         "node_modules",
+     }
+     # More comprehensive list of binary extensions
+     skip_extensions = {
+         ".pyc",
+         ".pyo",
+         ".so",
+         ".pyd",
+         ".egg",
+         ".whl",  # Python
+         ".o",
+         ".a",
+         ".dll",
+         ".exe",  # Compiled
+         ".svg",
+         ".png",
+         ".jpg",
+         ".jpeg",
+         ".gif",
+         ".ico",
+         ".webp",  # Images
+         ".woff",
+         ".woff2",
+         ".ttf",
+         ".eot",
+         ".otf",  # Fonts
+         ".zip",
+         ".tar",
+         ".gz",
+         ".bz2",
+         ".7z",
+         ".rar",  # Archives
+         ".pdf",
+         ".doc",
+         ".docx",
+         ".xls",
+         ".xlsx",
+         ".ppt",
+         ".pptx",
+         ".odt",  # Docs
+         ".mp3",
+         ".mp4",
+         ".wav",
+         ".flac",
+         ".ogg",
+         ".mov",
+         ".avi",
+         ".mkv",  # Media
+     }
+
+     file_count = 0
+     for root, dirs, files in os.walk(directory_path):
+         # Modify dirs in-place to prune the search
+         dirs[:] = [d for d in dirs if d not in skip_dirs]
+
+         for filename in files:
+             file_path = os.path.join(root, filename)
+             relative_path = os.path.relpath(file_path, directory_path)
+             file_count += 1
+
+             _, extension = os.path.splitext(filename)
+             if extension.lower() in skip_extensions:
+                 continue
+
+             if _is_binary_file(file_path):
+                 logger.debug(
+                     f"Skipping binary file detected by content: {relative_path}"
+                 )
+                 continue
+
+             logger.debug(f"Scanning file: {relative_path}")
+
+             try:
+                 with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                     content = f.read()
+
+                 locator = f"{locator_prefix}/{relative_path}"
+
+                 # 1. Special handling for AUTHORS files
+                 if filename.lower().startswith(
+                     "authors"
+                 ) or filename.lower().startswith("contributors"):
+                     evidence_list.extend(_process_authors_file(content, locator, now))
+                     continue  # Don't process this file further for generic matches
+
+                 # 2. Use NER for copyright lines
+                 for match in COPYRIGHT_RE.finditer(content):
+                     copyright_text = match.group(1).strip().rstrip(",.")
+
+                     # Try NER first
+                     entities = ner.extract_entities(copyright_text)
+                     if entities:
+                         for entity_name, entity_label in entities:
+                             if entity_name.lower() not in JUNK_WORDS:
+                                 value: dict[str, str | None] = {
+                                     "holder": entity_name,
+                                     "file": relative_path,
+                                 }
+                                 notes = f"Found copyright holder '{entity_name}' via NER ({entity_label})."
+                                 record = EvidenceRecord(
+                                     id=generate_evidence_id(
+                                         EvidenceSource.WHEEL,
+                                         EvidenceKind.COPYRIGHT,
+                                         locator,
+                                         str(value),
+                                         entity_name,
+                                     ),
+                                     source=EvidenceSource.WHEEL,
+                                     locator=locator,
+                                     kind=EvidenceKind.COPYRIGHT,
+                                     value=value,
+                                     observed_at=now,
+                                     confidence=0.40,  # Higher confidence for NER
+                                     notes=notes,
+                                 )
+                                 # Avoid appending duplicate records with identical notes.
+                                 if not any(
+                                     already.notes == notes for already in evidence_list
+                                 ):
+                                     evidence_list.append(record)
+                     # else:
+                     #     # --- Stricter filtering for the regex fallback ---
+                     #     # 1. Reject if it's too long to be a name.
+                     #     if len(copyright_text) > 50: continue
+                     #     # 2. Reject if it contains common license garbage words.
+                     #     if any(word in copyright_text.lower() for word in JUNK_WORDS): continue
+                     #
+                     #     value = {"holder": copyright_text, "file": relative_path}
+                     #     record = EvidenceRecord(
+                     #         id=generate_evidence_id(EvidenceSource.WHEEL, EvidenceKind.COPYRIGHT, locator, str(value),
+                     #                                 copyright_text),
+                     #         source=EvidenceSource.WHEEL, locator=locator, kind=EvidenceKind.COPYRIGHT,
+                     #         value=value, observed_at=now, confidence=0.25,
+                     #         notes=f"Found copyright notice for '{copyright_text}' in file (regex fallback)."
+                     #     )
+                     #     evidence_list.append(record)
+
+                 # 3. Scan for __author__ tags in Python files
+                 if filename.endswith(".py"):
+                     for match in AUTHOR_RE.finditer(content):
+                         author_str = match.group(1).strip()
+                         parsed = _parse_contact_string(author_str)
+                         if not parsed.get("name") and not parsed.get("email"):
+                             continue
+
+                         value = {"name": parsed["name"], "email": parsed["email"]}
+                         name_for_slug = parsed["name"] or parsed["email"] or "unknown"
+                         record = EvidenceRecord(
+                             id=generate_evidence_id(
+                                 EvidenceSource.WHEEL,
+                                 EvidenceKind.AUTHOR_TAG,
+                                 locator,
+                                 str(value),
+                                 name_for_slug,
+                             ),
+                             source=EvidenceSource.WHEEL,
+                             locator=locator,
+                             kind=EvidenceKind.AUTHOR_TAG,
+                             value=value,
+                             observed_at=now,
+                             confidence=0.20,
+                             notes=f"Found __author__ tag for '{author_str}' in file.",
+                         )
+                         evidence_list.append(record)
+
+                 # 4. Scan for any standalone email address (lower confidence)
+                 # First, find candidates with regex, then validate them properly.
+                 for match in EMAIL_RE.finditer(content):
+                     potential_email = match.group(0)
+                     if valid_email := is_valid_email(potential_email):
+                         value = {"name": None, "email": valid_email}
+                         notes = (
+                             f"Found validated contact email '{valid_email}' in file."
+                         )
+                         record = EvidenceRecord(
+                             id=generate_evidence_id(
+                                 EvidenceSource.WHEEL,
+                                 EvidenceKind.CONTACT,
+                                 locator,
+                                 str(value),
+                                 valid_email,
+                             ),
+                             source=EvidenceSource.WHEEL,
+                             locator=locator,
+                             kind=EvidenceKind.CONTACT,
+                             value=value,
+                             observed_at=now,
+                             confidence=0.15,  # Slightly higher confidence now that it's validated
+                             notes=notes,
+                         )
+                         # Avoid appending duplicate records with identical notes.
+                         if not any(
+                             already.notes == notes for already in evidence_list
+                         ):
+                             evidence_list.append(record)
+
+             except (IOError, UnicodeDecodeError) as e:
+                 logger.debug(f"Could not read or process file {file_path}: {e}")
+                 continue
+
+     logger.info(
+         f"Scanned {file_count} files in directory, found {len(evidence_list)} potential evidence records."
+     )
+     return evidence_list
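
For orientation, the scanner above can be driven directly once a wheel has been unpacked. A minimal sketch, assuming an illustrative extraction path and locator prefix (neither ships with the package); it relies only on scan_directory and the EvidenceRecord fields visible in this diff:

    # Hypothetical driver for source_scanner; path and prefix are placeholders.
    from skip_trace.analysis.source_scanner import scan_directory

    records = scan_directory("/tmp/unpacked-wheel", "skip-trace/0.1.0")
    for rec in records:
        # kind, confidence, and value are fields set on each EvidenceRecord above.
        print(rec.kind, rec.confidence, rec.value)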
skip_trace/cli.py ADDED
@@ -0,0 +1,177 @@
+ # skip_trace/cli.py
+ from __future__ import annotations
+
+ import sys
+ from typing import List, Optional
+
+ from rich_argparse import RichHelpFormatter
+
+ from . import __version__
+ from .main import run_command
+ from .utils.cli_suggestions import SmartParser
+
+
+ def create_parser() -> SmartParser:
+     """Creates the main argument parser for the application."""
+
+     parser = SmartParser(
+         prog="skip-trace",
+         description="Infer ownership of Python packages from public artifacts and local source.",
+         epilog="For more help on a specific command, use: skip-trace <command> -h",
+         formatter_class=RichHelpFormatter,
+     )
+     parser.add_argument(
+         "-v", "--version", action="version", version=f"%(prog)s {__version__}"
+     )
+
+     # --- --verbose flag ---
+     parser.add_argument(
+         "--verbose",
+         action="store_const",
+         dest="log_level",
+         const="DEBUG",
+         default="WARNING",
+         help="Enable verbose (debug) logging.",
+     )
+     parser.add_argument(
+         "--log-level",
+         choices=["ERROR", "WARNING", "INFO", "DEBUG"],
+         help="Set the logging level (overridden by --verbose).",
+     )
+
+     fmt = parser.add_mutually_exclusive_group()
+     fmt.add_argument(
+         "--json",
+         dest="output_format",
+         action="store_const",
+         const="json",
+         help="Output results in JSON format.",
+     )
+     fmt.add_argument(
+         "--md",
+         dest="output_format",
+         action="store_const",
+         const="md",
+         help="Output results in Markdown format.",
+     )
+
+     parser.add_argument(
+         "--no-redact",
+         action="store_true",
+         help="Do not redact contact information in output.",
+     )
+     parser.add_argument(
+         "--llm-ner",
+         choices=["off", "on", "auto"],
+         default="auto",
+         help="Control LLM-assisted Named Entity Recognition.",
+     )
+     parser.add_argument(
+         "--jobs", type=int, default=None, help="Number of concurrent jobs to run."
+     )
+     parser.add_argument(
+         "--cache-dir", type=str, default=None, help="Path to the cache directory."
+     )
+
+     sub = parser.add_subparsers(dest="command", required=True, title="Commands")
+
+     # --- `who-owns` subcommand ---
+     p_who = sub.add_parser(
+         "who-owns", help="Find ownership for a single remote package."
+     )
+     p_who.add_argument("package", help="The name of the package (e.g., 'requests').")
+     p_who.add_argument("--version", help="The specific version of the package.")
+
+     # --- `venv` subcommand ---
+     p_venv = sub.add_parser(
+         "venv", help="Scan all packages in a virtual environment (not yet implemented)."
+     )
+     p_venv.add_argument(
+         "--path", help="Path to the Python executable or site-packages of the venv."
+     )
+
+     # --- `reqs` subcommand ---
+     p_reqs = sub.add_parser(
+         "reqs", help="Scan packages from a requirements file (not yet implemented)."
+     )
+     p_reqs.add_argument("requirements_file", help="Path to the requirements.txt file.")
+
+     # --- `explain` subcommand ---
+     p_explain = sub.add_parser(
+         "explain",
+         help="Show the evidence behind an ownership claim (not yet implemented).",
+     )
+     p_explain.add_argument("package", help="The name of the package.")
+     p_explain.add_argument("--id", help="The specific evidence ID to display.")
+
+     # --- `graph` subcommand ---
+     p_graph = sub.add_parser(
+         "graph", help="Generate an ownership graph for a package (not yet implemented)."
+     )
+     p_graph.add_argument("package", help="The name of the package.")
+     p_graph.add_argument(
+         "--format",
+         choices=["dot", "mermaid"],
+         default="mermaid",
+         help="The output format for the graph.",
+     )
+
+     # --- `cache` subcommand ---
+     p_cache = sub.add_parser("cache", help="Manage the local cache.")
+     cache_group = p_cache.add_mutually_exclusive_group(required=True)
+     cache_group.add_argument(
+         "--clear",
+         action="store_true",
+         help="Clear all cached data (not yet implemented).",
+     )
+     cache_group.add_argument(
+         "--show", action="store_true", help="Show cache statistics and location."
+     )
+
+     # --- `policy` subcommand ---
+     p_policy = sub.add_parser(
+         "policy", help="Configure and view policy thresholds (not yet implemented)."
+     )
+     p_policy.add_argument(
+         "--min-score", type=float, help="Set the minimum score for a package to 'pass'."
+     )
+     p_policy.add_argument(
+         "--fail-under",
+         type=float,
+         help="Set the score below which a package is 'anonymous'.",
+     )
+
+     # Set default output format
+     parser.set_defaults(output_format="md")
+
+     return parser
+
+
+ def main(argv: Optional[List[str]] = None) -> int:
+     """
+     Main entry point for the CLI.
+
+     Parses arguments and dispatches to the main application logic.
+     :param argv: Command line arguments (defaults to sys.argv[1:]).
+     :return: Exit code.
+     """
+     if argv is None:
+         argv = sys.argv[1:]
+
+     parser = create_parser()
+     args = parser.parse_args(argv)
+
+     # When stdout is not a TTY (e.g. piped), force JSON output.
+     if (
+         not sys.stdout.isatty()
+         and "output_format" in args
+         and args.output_format != "json"
+     ):
+         args.output_format = "json"
+
+     try:
+         return run_command(args)
+     except Exception as e:
+         # TODO: Add proper logging based on log-level
+         print(f"An unexpected error occurred: {e}", file=sys.stderr)
+         return 1
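
A rough invocation sketch for the CLI above, using only flags and subcommands defined in create_parser (the package name and version are illustrative; note that global flags such as --json must precede the subcommand):

    # Hypothetical programmatic invocation, mirroring:
    #   skip-trace --json who-owns requests --version 2.32.0
    from skip_trace.cli import main

    exit_code = main(["--json", "who-owns", "requests", "--version", "2.32.0"])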
skip_trace/collectors/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # skip_trace/collectors/__init__.py
+ from . import github, package_files, pypi, whois
+
+ __all__ = ["github", "pypi", "whois", "package_files"]