fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,91 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ from typing import Any
5
+
6
+ import pyarrow as pa # type: ignore[import-untyped]
7
+ import pyarrow.parquet as pq # type: ignore[import-untyped]
8
+
9
+ from datasmith.github.models import FormulaCodeRecord
10
+ from datasmith.utils import get_logger
11
+ from datasmith.utils.db import fetch_all
12
+
13
+ logger = get_logger("publish.records")
14
+
15
+
16
def records_to_parquet(records: list[FormulaCodeRecord]) -> bytes:
    """Encode FormulaCodeRecords as the bytes of an in-memory Parquet file.

    Each record is dumped with ``model_dump(mode="json")`` and the resulting
    dicts are turned into a pyarrow table. No explicit schema is passed, so
    pyarrow derives it from the dumped rows.

    Returns:
        The Parquet file contents, or ``b""`` when ``records`` is empty.
    """
    if records:
        table = pa.Table.from_pylist([record.model_dump(mode="json") for record in records])
        sink = io.BytesIO()
        pq.write_table(table, sink)
        return sink.getvalue()
    return b""
28
+
29
+
30
def records_from_parquet(data: bytes) -> list[FormulaCodeRecord]:
    """Decode Parquet bytes into FormulaCodeRecords.

    Every row of the table is passed to ``FormulaCodeRecord`` as keyword
    arguments. Empty input yields an empty list without touching pyarrow.
    """
    if data:
        table = pq.read_table(io.BytesIO(data))
        return [FormulaCodeRecord(**fields) for fields in table.to_pylist()]
    return []
38
+
39
+
40
def records_from_supabase(
    start_date: str | None = None,
    end_date: str | None = None,
    unpublished_only: bool = True,
) -> list[FormulaCodeRecord]:
    """Query Supabase for FormulaCodeRecords, optionally filtered by date and publish status.

    Only ``pull_requests`` rows with ``is_performance_commit`` set are fetched.

    Args:
        start_date: If given, keep rows whose ``merged_at`` is >= this value.
        end_date: If given, keep rows whose ``merged_at`` is <= this value.
        unpublished_only: When true, keep only rows whose ``published_at`` is null.

    Returns:
        One record per row that could be converted. Rows that fail conversion
        are logged together with the failure reason and skipped.
    """
    filters: dict[str, Any] = {"is_performance_commit": True}
    is_null: list[str] = ["published_at"] if unpublished_only else []
    gte_filters: dict[str, Any] = {}
    lte_filters: dict[str, Any] = {}

    if start_date:
        gte_filters["merged_at"] = start_date
    if end_date:
        lte_filters["merged_at"] = end_date

    rows = fetch_all(
        "pull_requests",
        select="*",
        filters=filters,
        # Empty filter collections are passed as None, as in the original call.
        is_null=is_null or None,
        gte_filters=gte_filters or None,
        lte_filters=lte_filters or None,
    )

    records: list[FormulaCodeRecord] = []
    for row in rows:
        try:
            # NOTE(review): the "" defaults below only apply when a key is
            # absent; a key present with a null value is passed through as
            # None. Confirm FormulaCodeRecord accepts that for these fields.
            record = FormulaCodeRecord(
                owner=row["owner"],
                repo=row["repo"],
                issue_number=row["issue_number"],
                task_id=f"{row['owner']}__{row['repo']}-{row['issue_number']}",
                gt_hash=row.get("merge_commit_sha", ""),
                base_commit=row.get("base_sha", ""),
                date=row.get("merged_at"),
                instructions=row.get("rendered_problem", ""),
                classification=row.get("classification", ""),
                difficulty=row.get("difficulty", ""),
                container_name=row.get("container_name", ""),
                patch=row.get("patch", ""),
            )
        except Exception as exc:
            # Best effort: one malformed row must not abort the whole export,
            # but the reason is logged so the row can be diagnosed.
            logger.warning(
                "Failed to create record for %s/%s#%s: %s",
                row.get("owner"),
                row.get("repo"),
                row.get("issue_number"),
                exc,
            )
            continue
        records.append(record)

    return records
datasmith/py.typed ADDED
@@ -0,0 +1 @@
1
+ # This file marks the datasmith package as typed for mypy and other type checkers
@@ -0,0 +1,14 @@
1
+ """Resolution package for analyzing commits and resolving dependencies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ __all__ = ["analyze_commit"]
8
+
9
+
10
def analyze_commit(sha: str, repo_name: str, bypass_cache: bool = False) -> dict[str, Any] | None:
    """Forward to :func:`datasmith.resolution.orchestrator.analyze_commit`.

    The orchestrator module is imported inside the function body, so it is
    only loaded on the first call rather than when this package is imported.
    """
    from . import orchestrator

    return orchestrator.analyze_commit(sha=sha, repo_name=repo_name, bypass_cache=bypass_cache)
@@ -0,0 +1,145 @@
1
+ """Dynamic blocklist for packages that don't exist on PyPI or can't be resolved."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ import threading
8
+
9
+ from datasmith.utils import get_logger
10
+
11
+ from .constants import GIT_CACHE_DIR
12
+
13
+ logger = get_logger("resolution.blocklist")
14
+
15
+
16
def normalize_package_name(name: str) -> str:
    """Return the PEP 503 normalized form of ``name``.

    Every run of ``-``, ``_`` and ``.`` collapses to a single ``-`` and the
    result is lower-cased.
    """
    collapsed = re.sub(r"[-_.]+", "-", name)
    return collapsed.lower()
19
+
20
+
21
# On-disk location of the blocklist JSON file, inside the git cache directory.
BLOCKLIST_PATH = GIT_CACHE_DIR / "package_blocklist.json"

# Guards _blocklist_cache and the load/save cycle in add_to_blocklist.
# This is a threading.Lock, so it only serializes threads of this process.
_blocklist_lock = threading.Lock()
# In-memory copy of the blocklist; None until get_blocklist() or
# add_to_blocklist() first populates it.
_blocklist_cache: set[str] | None = None
25
+
26
+
27
def _load_blocklist() -> set[str]:
    """Read the blocked package names from BLOCKLIST_PATH.

    Returns an empty set when the file does not exist or cannot be read or
    parsed; the failure is logged as a warning instead of being raised.
    """
    if BLOCKLIST_PATH.exists():
        try:
            with BLOCKLIST_PATH.open("r") as handle:
                payload = json.load(handle)
            return set(payload.get("blocked_packages", []))
        except Exception as e:
            logger.warning("Failed to load blocklist from %s: %s", BLOCKLIST_PATH, e)
    return set()
38
+
39
+
40
def _save_blocklist(blocklist: set[str]) -> None:
    """Save the blocklist to disk.

    The JSON document is first written to a temporary sibling file and then
    renamed over BLOCKLIST_PATH. A crash mid-write or a concurrent reader
    therefore never observes a truncated file, which _load_blocklist would
    treat as an empty blocklist and which the next add_to_blocklist call
    would then overwrite.

    Failures are logged and swallowed: persisting the blocklist is best effort.
    """
    import os  # local import: only needed to build a per-process temp name

    tmp_path = BLOCKLIST_PATH.with_name(f"{BLOCKLIST_PATH.name}.{os.getpid()}.tmp")
    try:
        BLOCKLIST_PATH.parent.mkdir(parents=True, exist_ok=True)
        with tmp_path.open("w") as f:
            json.dump(
                {
                    "blocked_packages": sorted(blocklist),
                    "description": "Packages that don't exist on PyPI or can't be resolved",
                },
                f,
                indent=2,
            )
        # Path.replace overwrites the destination in a single rename step.
        tmp_path.replace(BLOCKLIST_PATH)
    except Exception as e:
        logger.warning("Failed to save blocklist to %s: %s", BLOCKLIST_PATH, e)
        # Do not leave a stray temp file behind after a failed save.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
55
+
56
+
57
def get_blocklist() -> set[str]:
    """Return a copy of the package blocklist, loading it from disk on first use.

    The module-level cache is filled under ``_blocklist_lock``; callers get
    their own copy, so mutating the returned set does not affect the cache.
    """
    global _blocklist_cache

    with _blocklist_lock:
        cached = _blocklist_cache
        if cached is None:
            cached = _blocklist_cache = _load_blocklist()
        return cached.copy()
65
+
66
+
67
def add_to_blocklist(package_name: str) -> bool:
    """Record ``package_name`` (PEP 503 normalized) in the blocklist.

    The blocklist is re-read from disk, extended, written back, and the
    in-memory cache is refreshed, all while holding ``_blocklist_lock``.

    Returns:
        True when the name was newly added; False for an empty or blank name
        or when the name was already blocked.
    """
    global _blocklist_cache

    stripped = package_name.strip() if package_name else ""
    if not stripped:
        return False

    normalized = normalize_package_name(stripped)

    with _blocklist_lock:
        current = _load_blocklist()
        if normalized in current:
            return False
        current.add(normalized)
        _save_blocklist(current)
        _blocklist_cache = current.copy()
        logger.info("Added '%s' to package blocklist", normalized)
        return True
85
+
86
+
87
def extract_failing_package(error_log: str) -> str | None:
    """Pull the offending package name out of a uv resolution error log.

    Three message shapes are recognized, tried in this order:

    1. ``Because <pkg> was not found in the package registry``
    2. ``Because there are no versions of <pkg>``
    3. ``Because you require <pkg>==X and <pkg><op>Y, we can conclude``.
       This one is only accepted when ``<pkg>`` is a digits-and-hyphens token
       or one of the words ``install`` / ``uninstall``.

    Returns:
        The captured name, or None when the log is empty or nothing matches.
    """
    if not error_log:
        return None

    # Messages that name the missing package directly.
    direct_patterns = (
        r"Because ([\w\-]+) was not found in the package registry",
        r"Because there are no versions of ([\w\-]+)",
    )
    for pattern in direct_patterns:
        hit = re.search(pattern, error_log)
        if hit:
            return hit.group(1)

    conflict = re.search(
        r"Because you require ([\w\-]+)==[\d\.]+ and \1[><=!]+[\d\.]+, we can conclude",
        error_log,
    )
    if conflict is None:
        return None

    candidate = conflict.group(1)
    looks_like_version = re.match(r"^\d+[\-\d]+$", candidate) is not None
    if looks_like_version or candidate in {"uninstall", "install"}:
        return candidate
    return None
110
+
111
+
112
def should_retry_without_package(error_log: str) -> bool:
    """Tell whether a failed resolution is worth retrying with the failing package dropped.

    Only "package not found" and "no versions" failures qualify. Build
    failures, download failures and anything else are not retried.
    """
    if not error_log:
        return False

    retryable_markers = (
        "was not found in the package registry",
        "Because there are no versions of",
    )
    return any(marker in error_log for marker in retryable_markers)
125
+
126
+
127
def remove_package_from_requirements(requirements: list[str], package_name: str) -> tuple[list[str], bool]:
    """Drop every requirement line that names ``package_name``.

    The leading name of each requirement string is compared with
    ``package_name`` after PEP 503 normalization of both. Lines with no
    recognizable leading name are kept.

    Returns:
        ``(filtered, was_removed)``. When ``package_name`` is empty the input
        list object itself is returned, unmodified, with ``False``.
    """
    if not package_name:
        return requirements, False

    target = normalize_package_name(package_name)
    leading_name = re.compile(r"^([a-zA-Z0-9]([a-zA-Z0-9._-]*[a-zA-Z0-9])?)")

    def names_target(requirement: str) -> bool:
        found = leading_name.match(requirement)
        return found is not None and normalize_package_name(found.group(1)) == target

    kept = [requirement for requirement in requirements if not names_target(requirement)]
    return kept, len(kept) != len(requirements)
@@ -0,0 +1,120 @@
1
+ """SQLite-backed caching helpers for resolution results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import functools
7
+ import os
8
+ import pickle
9
+ import re
10
+ import sqlite3
11
+ import threading
12
+ from collections.abc import Iterator
13
+ from typing import Callable, ParamSpec, TypeVar, cast
14
+
15
# Serializes SQLite access among threads of this process; cache_completion
# combines it with _file_lock for the statements that write to the database.
_cache_lock = threading.Lock()
# Type variables that let cache_completion preserve the wrapped function's
# parameter list (_P) and return type (_T) in its annotations.
_P = ParamSpec("_P")
_T = TypeVar("_T")
18
+
19
+
20
+ @contextlib.contextmanager
21
+ def _file_lock(lock_path: str) -> Iterator[None]:
22
+ """Cross-process exclusive lock using a sidecar .lock file."""
23
+ lockfile = lock_path + ".lock"
24
+ fd = os.open(lockfile, os.O_CREAT | os.O_RDWR, 0o644)
25
+ try:
26
+ try:
27
+ import fcntl # POSIX
28
+
29
+ fcntl.flock(fd, fcntl.LOCK_EX)
30
+ yield
31
+ fcntl.flock(fd, fcntl.LOCK_UN)
32
+ except ImportError: # pragma: no cover
33
+ import msvcrt
34
+
35
+ msvcrt.locking(fd, msvcrt.LK_LOCK, 1) # type: ignore[attr-defined]
36
+ yield
37
+ msvcrt.locking(fd, msvcrt.LK_UNLCK, 1) # type: ignore[attr-defined]
38
+ finally:
39
+ os.close(fd)
40
+
41
+
42
+ def _get_db_connection(db_loc: str) -> sqlite3.Connection:
43
+ """Open a SQLite connection configured for concurrent workloads."""
44
+ conn = sqlite3.connect(db_loc, timeout=30, isolation_level=None)
45
+ jm = conn.execute("PRAGMA journal_mode=WAL;").fetchone()[0].lower()
46
+ if jm != "wal":
47
+ conn.execute("PRAGMA journal_mode=TRUNCATE;")
48
+ conn.execute("PRAGMA synchronous=FULL;")
49
+ else:
50
+ conn.execute("PRAGMA synchronous=NORMAL;")
51
+ conn.execute("PRAGMA busy_timeout=30000;")
52
+ return conn
53
+
54
+
55
+ def cache_completion(db_loc: str, table_name: str = "cache") -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
56
+ """Cache function results in a SQLite table keyed by args/kwargs.
57
+
58
+ Passing ``bypass_cache=True`` to the wrapped function forces a refresh and
59
+ overwrites the cached result.
60
+ """
61
+ if not re.match(r"^\w+$", table_name):
62
+ raise ValueError("table_name must be alphanumeric/underscore only")
63
+
64
+ def decorator(func: Callable[_P, _T]) -> Callable[_P, _T]:
65
+ @functools.wraps(func)
66
+ def wrapped(*args: _P.args, **kwargs: _P.kwargs) -> _T:
67
+ bypass = cast(bool, kwargs.pop("bypass_cache", False))
68
+ key_kwargs = dict(sorted(kwargs.items()))
69
+
70
+ conn = _get_db_connection(db_loc)
71
+ try:
72
+ function_name = func.__name__
73
+
74
+ create_sql = (
75
+ f"CREATE TABLE IF NOT EXISTS {table_name} ("
76
+ " function_name TEXT NOT NULL,"
77
+ " argument_blob BLOB NOT NULL,"
78
+ " result_blob BLOB,"
79
+ " created_at TEXT DEFAULT CURRENT_TIMESTAMP,"
80
+ " updated_at TEXT DEFAULT CURRENT_TIMESTAMP,"
81
+ " PRIMARY KEY (function_name, argument_blob)"
82
+ ")"
83
+ )
84
+ with _file_lock(db_loc), _cache_lock:
85
+ conn.execute(create_sql)
86
+
87
+ set_clause = "result_blob=excluded.result_blob, updated_at=CURRENT_TIMESTAMP"
88
+ args_blob = pickle.dumps((function_name, args, key_kwargs))
89
+
90
+ if not bypass:
91
+ with _cache_lock:
92
+ row = conn.execute(
93
+ f"SELECT result_blob FROM {table_name} WHERE function_name=? AND argument_blob=?", # noqa: S608
94
+ (function_name, args_blob),
95
+ ).fetchone()
96
+ if row is not None:
97
+ return cast(_T, pickle.loads(row[0])) # noqa: S301
98
+
99
+ if "bypass_cache" in func.__code__.co_varnames:
100
+ kwargs["bypass_cache"] = bypass
101
+
102
+ result = func(*args, **kwargs)
103
+ result_blob = pickle.dumps(result)
104
+
105
+ with _file_lock(db_loc), _cache_lock:
106
+ conn.execute(
107
+ f"""
108
+ INSERT INTO {table_name}(function_name, argument_blob, result_blob)
109
+ VALUES(?, ?, ?)
110
+ ON CONFLICT(function_name, argument_blob) DO UPDATE SET {set_clause}
111
+ """,
112
+ (function_name, args_blob, result_blob),
113
+ )
114
+ return result
115
+ finally:
116
+ conn.close()
117
+
118
+ return wrapped
119
+
120
+ return decorator
@@ -0,0 +1,277 @@
1
+ """Constants used in dependency resolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import sys
8
+ from pathlib import Path
9
+
10
# Regular expressions
# Path whose last component is an optionally dot-prefixed "asv..." file ending
# in .json or .jsonc.
ASV_REGEX = re.compile(r"(^|/)\.?asv[^/]*\.jsonc?$")
# Path ending in .txt with a component that is "constraints", optionally
# followed by ".<suffix>", or that starts with "requirements". Note that the
# ".*" after "requirements" may span "/" separators.
REQ_TXT_REGEX = re.compile(r"(^|/)(?:constraints(?:\.[-\w]+)?|requirements.*)\.txt$")
# ANSI CSI escape sequence: ESC "[" followed by parameter/intermediate bytes
# and a final byte.
ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
# Captures the extra name from a marker such as `; extra == "docs"`.
EXTRA_MARKER_RE = re.compile(r';\s*extra\s*==\s*["\']([^"\']+)["\']')

# File names
PYPROJECT = "pyproject.toml"
SETUP_CFG = "setup.cfg"
SETUP_PY = "setup.py"
ENV_YML_NAMES = {"environment.yml", "environment.yaml"}

# Cache location for SQLite caches
# Read from the CACHE_LOCATION environment variable, defaulting to "cache.db".
CACHE_LOCATION: str = os.getenv("CACHE_LOCATION", "cache.db")

# Git cache directory
# Read from the GIT_CACHE_DIR environment variable, defaulting to a "git"
# directory next to CACHE_LOCATION. The directory is created at import time.
GIT_CACHE_DIR = Path(os.getenv("GIT_CACHE_DIR", str(Path(CACHE_LOCATION).parent / "git"))).expanduser()
GIT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
28
+
29
# Import name to PyPI package name mapping
# Keys are the names used in ``import`` statements; values are the
# distribution names they map to.
SPECIAL_IMPORT_TO_PYPI = {
    "sklearn": "scikit-learn",
    "PIL": "Pillow",
    "cv2": "opencv-python",
    "yaml": "PyYAML",
    "bs4": "beautifulsoup4",
    "Crypto": "pycryptodome",
}
38
+
39
# Conda-only and system packages that don't exist on PyPI
# (compiler toolchains and build utilities).
CONDA_SYSTEM_PACKAGES = {
    "pkg-config",
    "compilers",
    "c-compiler",
    "cxx-compiler",
    "fortran-compiler",
    "gcc",
    "gxx",
    "gfortran",
    "clang",
    "clangxx",
    "make",
    "cmake",
    "autoconf",
    "automake",
    "libtool",
    "m4",
    "patch",
    "bison",
    "flex",
}
61
+
62
# Packages that are stdlib or not installable via PyPI
NOT_REQUIREMENTS = {
    # py2 names / stdlib modules seen in repos
    "configparser",
    "ConfigParser",
    "asyncore",
    "distutils",
    "sqlite3",
    "tkinter",
    "Tkinter",
    "cStringIO",
    "urllib",
    "urllib2",
    # setuptools internals
    "pkg_resources",
    # platform frameworks (macOS)
    "AppKit",
    "Foundation",
    # Build/packaging tools (not runtime dependencies)
    "py2exe",
    "cx_Freeze",
    "py2app",
    "nuitka",
    # CLI verbs / interpreter references that slip through tokenization
    "python",
    "Python",
    "python3",
    "install",
    # NOTE(review): the remaining entries are not CLI verbs. They are a mix of
    # version-like tokens (e.g. "0-29-32") and assorted module/package names
    # in no particular order. Presumably they were accumulated from failed
    # resolutions; confirm, and consider grouping them.
    "0-29-32",
    "1-0",
    "1-2",
    "1-22-0",
    "1-3-2",
    "2-18-4",
    "2024-1-1",
    "3-0-0a10",
    "absl",
    "afl",
    "allel",
    "cartopy-userconfig",
    "closest-peak-direction-getter",
    "conans",
    "cprofile",
    "dask-core",
    "dateutil",
    "dbe",
    "deepchecks-metrics",
    "geopandas-base",
    "interpnd",
    "jpeg-ls",
    "libblas",
    "libpantab",
    "libwriter",
    "mo-pack",
    "mpl-toolkits",
    "pylab",
    # NOTE(review): "pyqt4" is also listed in ALLOWLIST_COMMON_PYPI; which set
    # takes precedence depends on the consumer. Confirm this is intended.
    "pyqt4",
    "pytables",
    "skbuild",
    "sklearnex",
    "skspatial",
    "system",
    "tunits",
    "vcr",
    "0-29-21",
    "0-29-33",
    "1-11-2",
    "1-12",
    "1-14-0",
    "1-23-5",
    "1-8-1",
    "3-0",
    "3-0-0a11",
    "3-1-2",
    "59-2-0",
    "c-distances-openmp",
    "column-parsers",
    "copy-reg",
    "cpickle",
    "cryptodome",
    "cupyx",
    "h5r",
    "h5s",
    "h5t",
    "givens-elimination",
    "imp",
    "libreader",
    "nattype",
    "omniscidbe",
    "openjpeg",
    "patoolib",
    "peerplaysbase",
    "probabilistic-direction-getter",
    "pyhdk",
    "pymake",
    "pyqt",
    "sksparse",
    "splitting",
    "stringio",
    "uninstall",
    "urlparse",
    "0-29-30",
    "1-9-1",
    "2-2",
    "3-0-5",
    "3-2-0",
    "backports",
    "cdms2",
    "flatted",
    "h5z",
    "pnetdicom",
    "pypocketfft",
    "vectorized",
    "voyager-ext",
}
177
+
178
# Well-known PyPI packages (allowlist for common names)
# Several names appear in more than one casing (e.g. "ipython"/"IPython",
# "cython"/"Cython"); presumably membership is tested case-sensitively.
# Verify against the consumer.
ALLOWLIST_COMMON_PYPI = {
    "numpy",
    "scipy",
    "pandas",
    "matplotlib",
    "xarray",
    "shapely",
    "fiona",
    "pyproj",
    "rtree",
    "torch",
    "functorch",
    "pytest",
    "ipython",
    "IPython",
    "ipykernel",
    "ipywidgets",
    "Cython",
    "cython",
    "numba",
    "scikit-learn",
    "sklearn",
    "sympy",
    "h5py",
    "Pillow",
    "pillow",
    "networkx",
    "dask",
    "seaborn",
    "xgboost",
    "statsmodels",
    "pyarrow",
    "geopandas",
    "cartopy",
    "tqdm",
    "psycopg2",
    "sqlalchemy",
    "SQLAlchemy",
    "requests",
    "setuptools",
    "wheel",
    "pip",
    "sphinx",
    "nbsphinx",
    "sphinx-gallery",
    "black",
    "isort",
    "flake8",
    "hypothesis",
    "pyqt5",
    # NOTE(review): "pyqt4" is also listed in NOT_REQUIREMENTS; which set
    # takes precedence depends on the consumer. Confirm this is intended.
    "pyqt4",
    "qtpy",
    "jupyter",
    "pooch",
    "graphviz",
    "numexpr",
}
236
+
237
# Generic names that are likely local modules, not PyPI packages
# Singular and plural forms are listed separately (e.g. "util"/"utils").
GENERIC_LOCAL_NAMES = {
    "lib",
    "libs",
    "utils",
    "util",
    "utilities",
    "core",
    "helpers",
    "helper",
    "common",
    "base",
    "tools",
    "tool",
    "config",
    "configs",
    "constants",
    "const",
    "types",
    "models",
    "model",
    "tests",
    "test",
    "testing",
    "benchmarks",
    "benchmark",
    "examples",
    "example",
    "scripts",
    "script",
    "data",
    "docs",
    "doc",
    "documentation",
}
272
+
273
+ # Python stdlib modules (Python 3.10+)
274
+ try:
275
+ STDLIB = set(sys.stdlib_module_names)
276
+ except Exception: # pragma: no cover
277
+ STDLIB = set()