cve-sentinel 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1121 @@
1
+ """Import statement scanner for Level 3 analysis.
2
+
3
+ This module scans source code files to find import/require statements
4
+ and maps them to package names for vulnerability detection.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import re
11
+ from abc import ABC, abstractmethod
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Pattern, Set, Type
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@dataclass
class ImportReference:
    """A single import/require occurrence located in a source file.

    Attributes:
        package_name: Name of the package the statement pulls in.
        file_path: Source file that contains the statement.
        line_number: 1-indexed line on which the statement appears.
        import_statement: Text of the import statement.
        ecosystem: Package ecosystem of the package (npm, pypi, etc.).
    """

    package_name: str
    file_path: Path
    line_number: int
    import_statement: str
    ecosystem: str

    def to_dict(self) -> Dict:
        """Return a plain dictionary view of this reference for serialization.

        ``file_path`` is rendered with ``str()``; every other field is copied
        through unchanged.
        """
        serialized: Dict = dict(
            package_name=self.package_name,
            file_path=str(self.file_path),
            line_number=self.line_number,
            import_statement=self.import_statement,
            ecosystem=self.ecosystem,
        )
        return serialized
46
+
47
+
48
class BaseLanguageScanner(ABC):
    """Base class for language-specific import scanners.

    Subclasses set ``FILE_EXTENSIONS`` and ``ECOSYSTEM`` and implement
    ``_extract_packages``. Directory traversal, exclusion handling and file
    reading are implemented here.
    """

    # File extensions this scanner handles
    FILE_EXTENSIONS: List[str] = []

    # Ecosystem name for this scanner
    ECOSYSTEM: str = ""

    # Names excluded by default. They are matched against whole path
    # components (see _should_exclude), not as substrings.
    DEFAULT_EXCLUDES: List[str] = [
        "node_modules",
        "vendor",
        ".git",
        "__pycache__",
        "venv",
        ".venv",
        "env",
        ".tox",
        "build",
        "dist",
        "target",
    ]

    def __init__(self, exclude_patterns: Optional[List[str]] = None) -> None:
        """Initialize the scanner.

        Args:
            exclude_patterns: Additional patterns to exclude from scanning.
                They are appended to a copy of ``DEFAULT_EXCLUDES``.
        """
        self.exclude_patterns = self.DEFAULT_EXCLUDES.copy()
        if exclude_patterns:
            self.exclude_patterns.extend(exclude_patterns)

    def _should_exclude(self, path: Path, root: Optional[Path] = None) -> bool:
        """Check if a path should be excluded from scanning.

        Names listed in ``DEFAULT_EXCLUDES`` must equal a whole path component,
        so ``env`` excludes ``env/app.py`` but not ``environment.py``. Any other
        (caller-supplied) pattern keeps plain substring semantics against the
        full path string.

        Args:
            path: Path to check.
            root: Optional scan root. When given and ``path`` lies below it,
                default names are matched only against the part of the path
                below ``root``, so the location of the project itself (for
                example a checkout under ``/build/``) cannot exclude every file.

        Returns:
            True if the path should be excluded.
        """
        relative = path
        if root is not None:
            try:
                relative = path.relative_to(root)
            except ValueError:
                # path is not below root; fall back to the path as given
                relative = path

        components = set(relative.parts)
        default_names = set(self.DEFAULT_EXCLUDES)
        path_str = str(path)

        for pattern in self.exclude_patterns:
            if pattern in default_names:
                if pattern in components:
                    return True
            elif pattern in path_str:
                return True
        return False

    def scan_directory(
        self,
        directory: Path,
        max_file_size: int = 1024 * 1024,  # 1MB default
    ) -> List[ImportReference]:
        """Scan a directory for import statements.

        Args:
            directory: Directory to scan.
            max_file_size: Maximum file size in bytes to scan.

        Returns:
            List of ImportReference objects found. Empty when ``directory``
            does not exist or is not a directory.
        """
        references: List[ImportReference] = []

        if not directory.exists() or not directory.is_dir():
            return references

        for ext in self.FILE_EXTENSIONS:
            for file_path in directory.glob(f"**/*{ext}"):
                if self._should_exclude(file_path, directory):
                    continue
                if not file_path.is_file():
                    continue

                # Files whose size cannot be read are skipped, as are files
                # larger than the configured limit.
                try:
                    size = file_path.stat().st_size
                except OSError:
                    continue
                if size > max_file_size:
                    logger.debug("Skipping large file: %s", file_path)
                    continue

                references.extend(self.scan_file(file_path))

        return references

    def scan_file(self, file_path: Path) -> List[ImportReference]:
        """Scan a single file for import statements.

        The file is decoded as UTF-8 with undecodable bytes ignored and is
        processed one line at a time through ``_extract_packages``.

        Args:
            file_path: Path to the file to scan.

        Returns:
            List of ImportReference objects found. Empty when the file cannot
            be read (a warning is logged in that case).
        """
        references: List[ImportReference] = []

        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
        except OSError as e:
            logger.warning("Failed to read file %s: %s", file_path, e)
            return references

        for line_num, line in enumerate(content.split("\n"), start=1):
            for pkg_name, statement in self._extract_packages(line):
                references.append(
                    ImportReference(
                        package_name=pkg_name,
                        file_path=file_path,
                        line_number=line_num,
                        import_statement=statement.strip(),
                        ecosystem=self.ECOSYSTEM,
                    )
                )

        return references

    @abstractmethod
    def _extract_packages(self, line: str) -> List[tuple]:
        """Extract package names from a line of code.

        Args:
            line: A single line of source code.

        Returns:
            List of tuples (package_name, import_statement).
        """
180
+
181
+
182
class JavaScriptScanner(BaseLanguageScanner):
    """Scanner for JavaScript/TypeScript import statements."""

    FILE_EXTENSIONS = [".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs"]
    ECOSYSTEM = "npm"

    # Patterns for JavaScript/TypeScript imports
    # import ... from 'package'  /  import 'package'
    IMPORT_FROM_PATTERN: Pattern = re.compile(
        r"""import\s+(?:(?:\{[^}]*\}|\*\s+as\s+\w+|\w+)(?:\s*,\s*(?:\{[^}]*\}|\*\s+as\s+\w+|\w+))*\s+from\s+)?['"]([^'"]+)['"]"""
    )
    # require('package')
    REQUIRE_PATTERN: Pattern = re.compile(r"""require\s*\(\s*['"]([^'"]+)['"]\s*\)""")
    # import('package') - dynamic import
    DYNAMIC_IMPORT_PATTERN: Pattern = re.compile(r"""import\s*\(\s*['"]([^'"]+)['"]\s*\)""")
    # export ... from 'package' (re-exports) and TypeScript `import type ... from 'package'`
    REEXPORT_PATTERN: Pattern = re.compile(
        r"""\b(?:export|import\s+type)\s+[^'"]*?\bfrom\s*['"]([^'"]+)['"]"""
    )
    # Closing line of a multi-line import/export list: `} from 'package'`
    CONTINUATION_FROM_PATTERN: Pattern = re.compile(r"""^\s*\}\s*from\s*['"]([^'"]+)['"]""")

    def _extract_packages(self, line: str) -> List[tuple]:
        """Extract package names from JavaScript/TypeScript import statements.

        Every pattern is applied to the line. A module specifier is reported
        once even when more than one pattern matches it (tracked by the span
        of the captured specifier).

        Args:
            line: A single line of source code.

        Returns:
            List of tuples (package_name, line).
        """
        results: List[tuple] = []
        seen_spans: Set[tuple] = set()

        patterns = (
            self.IMPORT_FROM_PATTERN,
            self.REQUIRE_PATTERN,
            self.DYNAMIC_IMPORT_PATTERN,
            self.REEXPORT_PATTERN,
            self.CONTINUATION_FROM_PATTERN,
        )
        for pattern in patterns:
            for match in pattern.finditer(line):
                span = match.span(1)
                if span in seen_spans:
                    continue
                seen_spans.add(span)

                pkg_name = self._normalize_package_name(match.group(1))
                if pkg_name:
                    results.append((pkg_name, line))

        return results

    def _normalize_package_name(self, pkg_path: str) -> Optional[str]:
        """Normalize a package path to a package name.

        Handles:
        - Scoped packages: @scope/package -> @scope/package
        - Subpath imports: package/subpath -> package
        - Relative/absolute paths: ./local, /abs -> None (excluded)
        - node: protocol: node:fs -> None (excluded)
        - Package-internal subpath imports: #internal -> None (excluded)
        - URL specifiers: https://host/mod.js -> None (excluded)
        - Empty-scope aliases: @/components -> None (excluded)

        Args:
            pkg_path: The raw package path from the import.

        Returns:
            Normalized package name or None if it should be excluded.
        """
        # Exclude relative and absolute file paths
        if pkg_path.startswith((".", "/")):
            return None

        # Exclude node: protocol (built-ins)
        if pkg_path.startswith("node:"):
            return None

        # "#name" subpath imports and URL specifiers are not registry packages
        if pkg_path.startswith("#") or "://" in pkg_path:
            return None

        parts = pkg_path.split("/")

        # Handle scoped packages (@scope/package)
        if pkg_path.startswith("@"):
            if parts[0] == "@":
                # An empty scope cannot be an npm scope; treated as a path alias
                return None
            if len(parts) >= 2:
                # Return @scope/package, ignore subpaths
                return f"{parts[0]}/{parts[1]}"
            return pkg_path

        # Regular package - first part before /
        return parts[0]
259
+
260
+
261
class PythonScanner(BaseLanguageScanner):
    """Scanner for Python import statements."""

    FILE_EXTENSIONS = [".py"]
    ECOSYSTEM = "pypi"

    # Patterns for Python imports
    # import package or import package as alias
    IMPORT_PATTERN: Pattern = re.compile(r"""^import\s+([\w.]+)""")
    # from package import ... or from package.sub import ...
    FROM_IMPORT_PATTERN: Pattern = re.compile(r"""^from\s+([\w.]+)\s+import\s+""")
    # One dotted module name at the start of an item in `import a, b as c`
    MODULE_NAME_PATTERN: Pattern = re.compile(r"""\s*([\w.]+)""")

    # Standard library modules to exclude
    STDLIB_MODULES: Set[str] = {
        "abc", "aifc", "argparse", "array", "ast", "asyncio", "atexit",
        "base64", "bdb", "binascii", "binhex", "bisect", "builtins", "bz2",
        "calendar", "cgi", "cgitb", "chunk", "cmath", "cmd", "code", "codecs",
        "codeop", "collections", "colorsys", "compileall", "concurrent",
        "configparser", "contextlib", "contextvars", "copy", "copyreg",
        "cProfile", "crypt", "csv", "ctypes", "curses",
        "dataclasses", "datetime", "dbm", "decimal", "difflib", "dis",
        "distutils", "doctest",
        "email", "encodings", "enum", "errno",
        "faulthandler", "fcntl", "filecmp", "fileinput", "fnmatch",
        "fractions", "ftplib", "functools",
        "gc", "getopt", "getpass", "gettext", "glob", "graphlib", "grp", "gzip",
        "hashlib", "heapq", "hmac", "html", "http",
        "imaplib", "imghdr", "imp", "importlib", "inspect", "io", "ipaddress",
        "itertools",
        "json",
        "keyword",
        "lib2to3", "linecache", "locale", "logging", "lzma",
        "mailbox", "mailcap", "marshal", "math", "mimetypes", "mmap",
        "modulefinder", "multiprocessing",
        "netrc", "nis", "nntplib", "numbers",
        "operator", "optparse", "os", "ossaudiodev",
        "pathlib", "pdb", "pickle", "pickletools", "pipes", "pkgutil",
        "platform", "plistlib", "poplib", "posix", "posixpath", "pprint",
        "profile", "pstats", "pty", "pwd", "py_compile", "pyclbr", "pydoc",
        "queue", "quopri",
        "random", "re", "readline", "reprlib", "resource", "rlcompleter",
        "runpy",
        "sched", "secrets", "select", "selectors", "shelve", "shlex", "shutil",
        "signal", "site", "smtpd", "smtplib", "sndhdr", "socket",
        "socketserver", "spwd", "sqlite3", "ssl", "stat", "statistics",
        "string", "stringprep", "struct", "subprocess", "sunau", "symtable",
        "sys", "sysconfig", "syslog",
        "tabnanny", "tarfile", "telnetlib", "tempfile", "termios", "test",
        "textwrap", "threading", "time", "timeit", "tkinter", "token",
        "tokenize", "trace", "traceback", "tracemalloc", "tty", "turtle",
        "turtledemo", "types", "typing",
        "unicodedata", "unittest", "urllib", "uu", "uuid",
        "venv",
        "warnings", "wave", "weakref", "webbrowser", "winreg", "winsound",
        "wsgiref",
        "xdrlib", "xml", "xmlrpc",
        "zipapp", "zipfile", "zipimport", "zlib",
        "_thread",
        # Standard library modules that were missing from the list above
        "antigravity", "asynchat", "asyncore", "audioop", "ensurepip",
        "genericpath", "idlelib", "msilib", "msvcrt", "nt", "ntpath",
        "nturl2path", "opcode", "pyexpat", "sre_compile", "sre_constants",
        "sre_parse", "this", "tomllib", "zoneinfo",
    }

    def _extract_packages(self, line: str) -> List[tuple]:
        """Extract package names from Python import statements.

        Handles ``import a``, ``import a as x, b.c`` (every module in the
        comma-separated list) and ``from a.b import ...``. Each package is
        reported at most once per line.

        Args:
            line: A single line of source code.

        Returns:
            List of tuples (package_name, line).
        """
        results: List[tuple] = []
        line_stripped = line.strip()
        module_paths: List[str] = []

        # 'import a, b as c': collect every module named in the statement
        if self.IMPORT_PATTERN.match(line_stripped):
            clause = line_stripped[len("import"):]
            # Only the first statement on the line is parsed; a trailing
            # comment or a ';'-separated statement is dropped.
            clause = clause.split("#", 1)[0].split(";", 1)[0]
            for item in clause.split(","):
                module = self.MODULE_NAME_PATTERN.match(item)
                if module:
                    module_paths.append(module.group(1))

        # 'from package import ...'
        match = self.FROM_IMPORT_PATTERN.match(line_stripped)
        if match:
            module_paths.append(match.group(1))

        seen: Set[str] = set()
        for module_path in module_paths:
            pkg_name = self._normalize_package_name(module_path)
            if pkg_name and pkg_name not in seen:
                seen.add(pkg_name)
                results.append((pkg_name, line))

        return results

    def _normalize_package_name(self, module_path: str) -> Optional[str]:
        """Normalize a module path to a package name.

        Args:
            module_path: The module path from the import (e.g., 'package.submodule').

        Returns:
            Top-level package name, or None for relative imports, standard
            library modules and underscore-prefixed modules.
        """
        # Relative imports ('.', '.pkg', '..pkg') have an empty first component
        top_level = module_path.split(".")[0]
        if not top_level:
            return None

        # Exclude standard library modules
        if top_level in self.STDLIB_MODULES:
            return None

        # Exclude underscore-prefixed modules such as __future__; a bare "_"
        # is passed through.
        if top_level.startswith("_") and top_level != "_":
            return None

        return top_level
521
+
522
+
523
class GoScanner(BaseLanguageScanner):
    """Scanner for Go import statements."""

    FILE_EXTENSIONS = [".go"]
    ECOSYSTEM = "go"

    # Patterns for Go imports
    # import "package", import alias "package", import . "package", import _ "package"
    SINGLE_IMPORT_PATTERN: Pattern = re.compile(
        r"""^\s*import\s+(?:[\w.]+\s+)?["'`]([^"'`]+)["'`]"""
    )
    # import ( "package" ) - one entry inside a block, with optional alias / . / _
    BLOCK_IMPORT_PATTERN: Pattern = re.compile(r"""^\s*(?:[\w.]+\s+)?["'`]([^"'`]+)["'`]""")
    # Opening line of an import block: `import (` or `import(`
    BLOCK_START_PATTERN: Pattern = re.compile(r"""^import\s*\(""")

    # Standard library packages to exclude. A trailing "/" is ignored when
    # matching: each entry excludes the package itself and everything below it.
    STDLIB_PREFIXES: List[str] = [
        "archive/", "bufio", "bytes", "compress/", "container/", "context",
        "crypto/", "database/", "debug/", "embed", "encoding/", "errors",
        "expvar", "flag", "fmt", "go/", "hash/", "html/", "image/", "index/",
        "io", "log/", "math/", "mime/", "net/", "os", "path/", "plugin",
        "reflect", "regexp", "runtime", "sort", "strconv", "strings", "sync",
        "syscall", "testing", "text/", "time", "unicode", "unsafe",
    ]

    def __init__(self, exclude_patterns: Optional[List[str]] = None) -> None:
        """Initialize the scanner.

        Args:
            exclude_patterns: Additional patterns to exclude from scanning.
        """
        super().__init__(exclude_patterns)
        # Not read by scan_file, which tracks block state in a local variable;
        # the attribute is kept so existing code that reads it keeps working.
        self._in_import_block = False

    def scan_file(self, file_path: Path) -> List[ImportReference]:
        """Scan a Go file, handling both single imports and import blocks.

        Overrides the base implementation because entries inside an
        ``import ( ... )`` block carry no ``import`` keyword, so each line has
        to be interpreted according to whether a block is currently open.

        Args:
            file_path: Path to the file to scan.

        Returns:
            List of ImportReference objects found. Empty when the file cannot
            be read (a warning is logged in that case).
        """
        references: List[ImportReference] = []

        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
        except OSError as e:
            logger.warning("Failed to read file %s: %s", file_path, e)
            return references

        in_import_block = False

        for line_num, line in enumerate(content.split("\n"), start=1):
            stripped = line.strip()

            # Start of an import block
            if self.BLOCK_START_PATTERN.match(stripped):
                in_import_block = True
                continue

            # End of an import block. startswith() also closes the block when
            # the parenthesis is followed by a comment, so later string
            # literals in the file are not mistaken for import entries.
            if in_import_block and stripped.startswith(")"):
                in_import_block = False
                continue

            pattern = self.BLOCK_IMPORT_PATTERN if in_import_block else self.SINGLE_IMPORT_PATTERN
            match = pattern.match(line)
            if not match:
                continue

            pkg_name = self._normalize_package_name(match.group(1))
            if pkg_name:
                references.append(
                    ImportReference(
                        package_name=pkg_name,
                        file_path=file_path,
                        line_number=line_num,
                        import_statement=stripped,
                        ecosystem=self.ECOSYSTEM,
                    )
                )

        return references

    def _extract_packages(self, line: str) -> List[tuple]:
        """Not used for Go - scan_file is overridden instead."""
        return []

    def _normalize_package_name(self, pkg_path: str) -> Optional[str]:
        """Normalize a Go import path to a package identifier.

        Args:
            pkg_path: The import path (e.g., 'github.com/user/repo/pkg').

        Returns:
            At most the first three path elements (host/user/repo), or None
            for standard library packages and for paths without a "/".
        """
        # Exclude the standard library. Matching stops at a path boundary so
        # that "os" excludes "os" and "os/exec" but not "oss.example.com/x".
        for prefix in self.STDLIB_PREFIXES:
            stdlib_root = prefix.rstrip("/")
            if pkg_path == stdlib_root or pkg_path.startswith(stdlib_root + "/"):
                return None

        # Paths without a "/" are not treated as external modules
        if "/" not in pkg_path:
            return None

        # Go modules typically use the first 3 parts: host/user/repo.
        # Shorter paths are returned unchanged by the slice.
        return "/".join(pkg_path.split("/")[:3])
673
+
674
+
675
class JavaScanner(BaseLanguageScanner):
    """Scanner for Java import statements."""

    FILE_EXTENSIONS = [".java"]
    ECOSYSTEM = "maven"

    # Pattern for Java imports
    # import package.Class; or import package.*; or import static package.Class.member;
    IMPORT_PATTERN: Pattern = re.compile(r"""^\s*import\s+(?:static\s+)?([\w.]+)(?:\.\*)?;""")

    # Java standard library and common internal packages to exclude
    STDLIB_PACKAGES: List[str] = [
        "java.",
        "javax.",
        "sun.",
        "com.sun.",
        "jdk.",
        # XML and GSS-API packages that ship with the JDK
        "org.w3c.dom.",
        "org.xml.sax.",
        "org.ietf.jgss.",
    ]

    def _extract_packages(self, line: str) -> List[tuple]:
        """Extract package names from Java import statements.

        Args:
            line: A single line of source code.

        Returns:
            List with at most one tuple (package_name, line).
        """
        results: List[tuple] = []

        match = self.IMPORT_PATTERN.match(line.strip())
        if match:
            pkg_name = self._normalize_package_name(match.group(1))
            if pkg_name:
                results.append((pkg_name, line))

        return results

    def _normalize_package_name(self, import_path: str) -> Optional[str]:
        """Normalize a Java import path to a package identifier.

        Args:
            import_path: The import path (e.g., 'org.apache.commons.lang3.StringUtils').

        Returns:
            The first two or three dot-separated parts of the path (e.g.
            'org.apache.commons'), or None for standard library packages and
            single-segment paths.
        """
        # A trailing dot is appended so that a wildcard import, captured
        # without its final ".*" (e.g. 'org.w3c.dom'), still matches the
        # dot-terminated prefix 'org.w3c.dom.'.
        if (import_path + ".").startswith(tuple(self.STDLIB_PACKAGES)):
            return None

        # Java package naming convention typically uses a reversed domain;
        # the first 2-3 parts are returned as the package identifier.
        parts = import_path.split(".")
        if len(parts) >= 2:
            return ".".join(parts[:3])

        return None
729
+
730
+
731
class RubyScanner(BaseLanguageScanner):
    """Scanner for Ruby require statements."""

    FILE_EXTENSIONS = [".rb"]
    ECOSYSTEM = "rubygems"

    # Patterns for Ruby requires
    # require 'package', require "package", require('package')
    REQUIRE_PATTERN: Pattern = re.compile(r"""^\s*require\s*\(?\s*['"]([^'"]+)['"]""")
    # require_relative should be excluded
    REQUIRE_RELATIVE_PATTERN: Pattern = re.compile(r"""^\s*require_relative\s+""")

    # Ruby standard library modules to exclude
    STDLIB_MODULES: Set[str] = {
        "abbrev", "base64", "benchmark", "bigdecimal", "cgi", "cmath",
        "coverage", "csv", "date", "dbm", "debug", "delegate", "digest", "drb",
        "english", "erb", "etc", "extmk", "fcntl", "fiddle", "fileutils",
        "find", "forwardable", "gdbm", "getoptlong", "io", "ipaddr", "irb",
        "json", "logger", "matrix", "minitest", "mkmf", "monitor", "mutex_m",
        "net", "nkf", "objspace", "observer", "open-uri", "open3", "openssl",
        "optparse", "ostruct", "pathname", "pp", "prettyprint", "prime",
        "pstore", "psych", "pty", "racc", "rake", "rdoc", "readline", "reline",
        "resolv", "resolv-replace", "rexml", "rinda", "ripper", "rss",
        "rubygems", "scanf", "sdbm", "securerandom", "set", "shellwords",
        "singleton", "socket", "stringio", "strscan", "syslog", "tempfile",
        "thwait", "time", "timeout", "tmpdir", "tracer", "tsort", "un",
        "unicode_normalize", "uri", "weakref", "webrick", "yaml", "zlib",
        # Standard library names missing from the list above. The lookup is
        # case-sensitive and the library is required as 'English'.
        "English", "expect", "optionparser", "rbconfig", "win32ole",
    }

    def _extract_packages(self, line: str) -> List[tuple]:
        """Extract package names from Ruby require statements.

        Args:
            line: A single line of source code.

        Returns:
            List with at most one tuple (package_name, line).
        """
        results: List[tuple] = []
        stripped = line.strip()

        # Skip require_relative
        if self.REQUIRE_RELATIVE_PATTERN.match(stripped):
            return results

        match = self.REQUIRE_PATTERN.match(stripped)
        if match:
            pkg_name = self._normalize_package_name(match.group(1))
            if pkg_name:
                results.append((pkg_name, line))

        return results

    def _normalize_package_name(self, gem_path: str) -> Optional[str]:
        """Normalize a gem path to a package name.

        Args:
            gem_path: The required path (e.g., 'rails' or 'active_support/core_ext').

        Returns:
            The first path segment, or None for file paths ('./x', '../x',
            '/abs/x') and standard library modules.
        """
        # Paths starting with "." or "/" point at files, not at gems
        if gem_path.startswith((".", "/")):
            return None

        # Get the top-level gem name
        gem_name = gem_path.split("/")[0]

        # Exclude standard library
        if gem_name in self.STDLIB_MODULES:
            return None

        return gem_name
870
+
871
+
872
class RustScanner(BaseLanguageScanner):
    """Scanner for Rust use statements."""

    FILE_EXTENSIONS = [".rs"]
    ECOSYSTEM = "crates.io"

    # Patterns for Rust
    # use package::...; / use package; / use package as alias; / use ::package::...;
    # optionally prefixed with a visibility such as `pub` or `pub(crate)`
    USE_PATTERN: Pattern = re.compile(
        r"""^\s*(?:pub(?:\s*\([^)]*\))?\s+)?use\s+(?:::\s*)?([\w]+)\s*(?:::|;|\bas\b)"""
    )
    # extern crate package; (optionally prefixed with a visibility)
    EXTERN_CRATE_PATTERN: Pattern = re.compile(
        r"""^\s*(?:pub(?:\s*\([^)]*\))?\s+)?extern\s+crate\s+([\w]+)"""
    )

    # Rust standard library crates and path keywords to exclude
    STDLIB_CRATES: Set[str] = {
        "std",
        "core",
        "alloc",
        "proc_macro",
        "test",
        "crate",
        "self",
        "super",
    }

    def _extract_packages(self, line: str) -> List[tuple]:
        """Extract crate names from Rust use/extern statements.

        Args:
            line: A single line of source code.

        Returns:
            List of tuples (crate_name, line); crates listed in
            ``STDLIB_CRATES`` are omitted.
        """
        results: List[tuple] = []
        stripped = line.strip()

        # 'use crate_name::...' first, then 'extern crate crate_name'
        for pattern in (self.USE_PATTERN, self.EXTERN_CRATE_PATTERN):
            match = pattern.match(stripped)
            if not match:
                continue
            crate_name = self._normalize_package_name(match.group(1))
            if crate_name:
                results.append((crate_name, line))

        return results

    def _normalize_package_name(self, crate_name: str) -> Optional[str]:
        """Normalize a crate name.

        Args:
            crate_name: First path segment of a use/extern statement.

        Returns:
            The crate name unchanged, or None if it is in ``STDLIB_CRATES``.
        """
        if crate_name in self.STDLIB_CRATES:
            return None
        return crate_name
922
+
923
+
924
class PHPScanner(BaseLanguageScanner):
    """Scanner for PHP use statements."""

    FILE_EXTENSIONS = [".php"]
    ECOSYSTEM = "packagist"

    # Patterns for PHP
    # use Namespace\Class; / use function Namespace\func; / use const Namespace\NAME;
    # An optional leading backslash is consumed outside the captured group.
    USE_PATTERN: Pattern = re.compile(r"""^\s*use\s+(?:function\s+|const\s+)?\\?([\w\\]+)""")

    # PHP internal namespaces to exclude
    INTERNAL_NAMESPACES: List[str] = [
        "Exception",
        "Error",
        "Throwable",
        "Iterator",
        "Generator",
        "Closure",
        "stdClass",
        "DateTime",
        "DateTimeImmutable",
        "DateInterval",
        "DatePeriod",
        "DateTimeZone",
    ]

    def _extract_packages(self, line: str) -> List[tuple]:
        """Extract package names from PHP use statements.

        require/include lines need no explicit filtering: USE_PATTERN is
        anchored on a leading ``use`` keyword, so they never match. Filtering
        on the substrings "require"/"include" would instead drop legitimate
        ``use`` lines whose namespace contains those words.

        Args:
            line: A single line of source code.

        Returns:
            List with at most one tuple (package_name, line).
        """
        results: List[tuple] = []

        match = self.USE_PATTERN.match(line.strip())
        if match:
            pkg_name = self._normalize_package_name(match.group(1))
            if pkg_name:
                results.append((pkg_name, line))

        return results

    def _normalize_package_name(self, namespace: str) -> Optional[str]:
        """Normalize a PHP namespace to a package name.

        Args:
            namespace: The use namespace (e.g., 'Symfony\\Component\\HttpFoundation').

        Returns:
            Lower-cased 'vendor/package' built from the first two namespace
            segments, or None for internal classes and for names with fewer
            than two non-empty segments.
        """
        # Replace backslashes with forward slashes and drop a leading
        # separator (fully-qualified form) so the vendor is the first segment.
        parts = namespace.replace("\\", "/").lstrip("/").split("/")

        # Skip internal PHP classes
        if parts[0] in self.INTERNAL_NAMESPACES:
            return None

        # Packagist convention: vendor/package. The second segment is empty
        # when only a group-use prefix ending in a separator was captured.
        if len(parts) >= 2 and parts[1]:
            return f"{parts[0].lower()}/{parts[1].lower()}"

        return None
990
+
991
+
992
class ImportScanner:
    """Main import scanner that coordinates language-specific scanners."""

    # Mapping of ecosystems to their scanners
    SCANNERS: Dict[str, Type[BaseLanguageScanner]] = {
        "npm": JavaScriptScanner,
        "pypi": PythonScanner,
        "go": GoScanner,
        "maven": JavaScanner,
        "rubygems": RubyScanner,
        "crates.io": RustScanner,
        "packagist": PHPScanner,
    }

    # File extension to ecosystem mapping
    EXTENSION_MAP: Dict[str, str] = {
        ".js": "npm",
        ".jsx": "npm",
        ".ts": "npm",
        ".tsx": "npm",
        ".mjs": "npm",
        ".cjs": "npm",
        ".py": "pypi",
        ".go": "go",
        ".java": "maven",
        ".rb": "rubygems",
        ".rs": "crates.io",
        ".php": "packagist",
    }

    def __init__(
        self,
        ecosystems: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ) -> None:
        """Initialize the import scanner.

        Args:
            ecosystems: List of ecosystems to scan for. If None or empty, all
                are enabled. Names not present in ``SCANNERS`` are skipped
                with a logged warning.
            exclude_patterns: Additional patterns to exclude from scanning.
        """
        self.exclude_patterns = exclude_patterns
        self.active_scanners: Dict[str, BaseLanguageScanner] = {}

        # Initialize scanners for requested ecosystems
        target_ecosystems = ecosystems or list(self.SCANNERS.keys())
        for eco in target_ecosystems:
            scanner_class = self.SCANNERS.get(eco)
            if scanner_class is None:
                # Make a mistyped ecosystem name visible instead of silently
                # scanning nothing for it.
                logger.warning("Ignoring unsupported ecosystem: %s", eco)
                continue
            self.active_scanners[eco] = scanner_class(exclude_patterns)

    def scan_directory(
        self,
        directory: Path,
        max_file_size: int = 1024 * 1024,
    ) -> Dict[str, List[ImportReference]]:
        """Scan a directory for import statements across all languages.

        Args:
            directory: Directory to scan.
            max_file_size: Maximum file size in bytes to scan.

        Returns:
            Dictionary mapping ecosystems to their import references.
            Ecosystems with no references are omitted.
        """
        results: Dict[str, List[ImportReference]] = {}

        for ecosystem, scanner in self.active_scanners.items():
            refs = scanner.scan_directory(directory, max_file_size)
            if refs:
                results[ecosystem] = refs

        return results

    def scan_file(self, file_path: Path) -> List[ImportReference]:
        """Scan a single file for import statements.

        The scanner is chosen from the lower-cased file suffix via
        ``EXTENSION_MAP``.

        Args:
            file_path: Path to the file to scan.

        Returns:
            List of ImportReference objects found; empty when the suffix is
            unknown or its ecosystem is not active.
        """
        ecosystem = self.EXTENSION_MAP.get(file_path.suffix.lower())

        if ecosystem and ecosystem in self.active_scanners:
            return self.active_scanners[ecosystem].scan_file(file_path)

        return []

    def get_imports_for_package(
        self,
        package_name: str,
        references: List[ImportReference],
    ) -> List[ImportReference]:
        """Filter import references for a specific package.

        Args:
            package_name: Package name to filter for (exact match).
            references: List of all import references.

        Returns:
            List of references for the specified package.
        """
        return [ref for ref in references if ref.package_name == package_name]

    @staticmethod
    def get_supported_extensions() -> List[str]:
        """Get list of supported file extensions."""
        return list(ImportScanner.EXTENSION_MAP.keys())

    @staticmethod
    def get_supported_ecosystems() -> List[str]:
        """Get list of supported ecosystems."""
        return list(ImportScanner.SCANNERS.keys())
1107
+
1108
+
1109
def get_scanner_for_ecosystem(ecosystem: str) -> Optional[BaseLanguageScanner]:
    """Build a scanner for the given ecosystem.

    Args:
        ecosystem: The ecosystem name, looked up in ``ImportScanner.SCANNERS``.

    Returns:
        A new scanner instance created with default arguments, or None when
        the ecosystem is not supported.
    """
    scanner_cls = ImportScanner.SCANNERS.get(ecosystem)
    return scanner_cls() if scanner_cls else None