lean-explore 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- lean_explore/__init__.py +14 -1
- lean_explore/api/__init__.py +12 -1
- lean_explore/api/client.py +60 -80
- lean_explore/cli/__init__.py +10 -1
- lean_explore/cli/data_commands.py +157 -479
- lean_explore/cli/display.py +171 -0
- lean_explore/cli/main.py +51 -608
- lean_explore/config.py +244 -0
- lean_explore/extract/__init__.py +5 -0
- lean_explore/extract/__main__.py +368 -0
- lean_explore/extract/doc_gen4.py +200 -0
- lean_explore/extract/doc_parser.py +499 -0
- lean_explore/extract/embeddings.py +371 -0
- lean_explore/extract/github.py +110 -0
- lean_explore/extract/index.py +317 -0
- lean_explore/extract/informalize.py +653 -0
- lean_explore/extract/package_config.py +59 -0
- lean_explore/extract/package_registry.py +45 -0
- lean_explore/extract/package_utils.py +105 -0
- lean_explore/extract/types.py +25 -0
- lean_explore/mcp/__init__.py +11 -1
- lean_explore/mcp/app.py +14 -46
- lean_explore/mcp/server.py +20 -35
- lean_explore/mcp/tools.py +70 -177
- lean_explore/models/__init__.py +9 -0
- lean_explore/models/search_db.py +76 -0
- lean_explore/models/search_types.py +53 -0
- lean_explore/search/__init__.py +32 -0
- lean_explore/search/engine.py +655 -0
- lean_explore/search/scoring.py +156 -0
- lean_explore/search/service.py +68 -0
- lean_explore/search/tokenization.py +71 -0
- lean_explore/util/__init__.py +28 -0
- lean_explore/util/embedding_client.py +92 -0
- lean_explore/util/logging.py +22 -0
- lean_explore/util/openrouter_client.py +63 -0
- lean_explore/util/reranker_client.py +189 -0
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/METADATA +55 -10
- lean_explore-1.0.0.dist-info/RECORD +43 -0
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/WHEEL +1 -1
- lean_explore-1.0.0.dist-info/entry_points.txt +2 -0
- lean_explore/cli/agent.py +0 -781
- lean_explore/cli/config_utils.py +0 -481
- lean_explore/defaults.py +0 -114
- lean_explore/local/__init__.py +0 -1
- lean_explore/local/search.py +0 -1050
- lean_explore/local/service.py +0 -392
- lean_explore/shared/__init__.py +0 -1
- lean_explore/shared/models/__init__.py +0 -1
- lean_explore/shared/models/api.py +0 -117
- lean_explore/shared/models/db.py +0 -396
- lean_explore-0.2.2.dist-info/RECORD +0 -26
- lean_explore-0.2.2.dist-info/entry_points.txt +0 -2
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/top_level.txt +0 -0
lean_explore/extract/doc_gen4.py
@@ -0,0 +1,200 @@
+"""Documentation generation using doc-gen4 for each package.
+
+This module provides functionality to run doc-gen4 on each package workspace
+to generate Lean documentation data for the extraction pipeline.
+"""
+
+import logging
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+from lean_explore.extract.github import extract_lean_version
+from lean_explore.extract.package_config import PackageConfig
+from lean_explore.extract.package_registry import PACKAGE_REGISTRY
+from lean_explore.extract.package_utils import (
+    get_extraction_order,
+    get_package_toolchain,
+    update_lakefile_docgen_version,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _clear_workspace_cache(workspace_path: Path) -> None:
+    """Clear entire Lake cache to force complete rebuild.
+
+    Removes the .lake/ directory and lake-manifest.json to ensure:
+    1. Fresh dependency resolution (latest compatible versions)
+    2. Fresh doc-gen4 output (regenerated BMP files)
+    3. No stale build artifacts
+
+    Use this for nightly updates to get a clean build from scratch.
+
+    Args:
+        workspace_path: Path to the package workspace.
+    """
+    manifest = workspace_path / "lake-manifest.json"
+    if manifest.exists():
+        logger.info(f"Removing {manifest}")
+        manifest.unlink()
+
+    lake_dir = workspace_path / ".lake"
+    if lake_dir.exists():
+        logger.info(f"Removing {lake_dir} to force complete rebuild")
+        shutil.rmtree(lake_dir)
+
+
+def _get_doc_lib_names(package_name: str) -> list[str]:
+    """Get the library names to run doc-gen4 on for a package.
+
+    Some packages have custom extract wrappers, others use upstream libraries directly.
+    """
+    lib_names: dict[str, list[str]] = {
+        "mathlib": ["MathExtract"],
+        "physlean": ["PhysExtract"],
+        "flt": ["FLTExtract"],
+        "formal-conjectures": ["FormalConjectures", "FormalConjecturesForMathlib"],
+        "cslib": ["CslibExtract"],
+    }
+    return lib_names.get(package_name, [f"{package_name.title()}Extract"])
+
+
+def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]:
+    """Fetch toolchain from GitHub and update lakefile.
+
+    Returns:
+        Tuple of (lean_toolchain, git_ref).
+    """
+    workspace_path = Path("lean") / package_config.name
+    lakefile_path = workspace_path / "lakefile.lean"
+    toolchain_file = workspace_path / "lean-toolchain"
+
+    lean_toolchain, git_ref = get_package_toolchain(package_config)
+    lean_version = extract_lean_version(lean_toolchain)
+
+    update_lakefile_docgen_version(lakefile_path, lean_version)
+    toolchain_file.write_text(lean_toolchain + "\n")
+
+    return lean_toolchain, git_ref
+
+
+def _run_lake_for_package(package_name: str, verbose: bool = False) -> None:
+    """Run lake update, cache get, and doc-gen4 for a package."""
+    workspace_path = Path("lean") / package_name
+    package_config = PACKAGE_REGISTRY[package_name]
+    env = os.environ.copy()
+    env["MATHLIB_NO_CACHE_ON_UPDATE"] = "1"
+
+    logger.info(f"[{package_name}] Running lake update...")
+    result = subprocess.run(
+        ["lake", "update"],
+        cwd=workspace_path,
+        capture_output=True,
+        text=True,
+        env=env,
+    )
+    if verbose and result.stdout:
+        logger.info(result.stdout)
+    if result.returncode != 0:
+        logger.error(result.stderr)
+        raise RuntimeError(f"lake update failed for {package_name}")
+
+    # Fetch mathlib cache for packages that depend on mathlib
+    if "mathlib" in package_config.depends_on or package_name == "mathlib":
+        logger.info(f"[{package_name}] Fetching mathlib cache...")
+        result = subprocess.run(
+            ["lake", "exe", "cache", "get"],
+            cwd=workspace_path,
+            capture_output=True,
+            text=True,
+            env=env,
+        )
+        if verbose and result.stdout:
+            logger.info(result.stdout)
+        if result.returncode != 0:
+            logger.warning(f"[{package_name}] Cache fetch failed (non-fatal)")
+
+    logger.info(f"[{package_name}] Running lake build...")
+    process = subprocess.Popen(
+        ["lake", "build"],
+        cwd=workspace_path,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
+    )
+    if process.stdout:
+        for line in process.stdout:
+            print(line, end="", flush=True)
+    if process.wait() != 0:
+        raise RuntimeError(f"lake build failed for {package_name}")
+
+    lib_names = _get_doc_lib_names(package_name)
+    for lib_name in lib_names:
+        logger.info(f"[{package_name}] Running doc-gen4 ({lib_name}:docs)...")
+
+        process = subprocess.Popen(
+            ["lake", "build", f"{lib_name}:docs"],
+            cwd=workspace_path,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+            env=env,
+        )
+        if process.stdout:
+            for line in process.stdout:
+                print(line, end="", flush=True)
+        returncode = process.wait()
+        if returncode != 0:
+            logger.warning(
+                f"[{package_name}] doc-gen4 had failures for {lib_name} "
+                "(continuing with generated docs)"
+            )
+
+
+async def run_doc_gen4(
+    packages: list[str] | None = None,
+    setup: bool = True,
+    fresh: bool = False,
+    verbose: bool = False,
+) -> None:
+    """Run doc-gen4 for each package to generate documentation data.
+
+    Args:
+        packages: List of package names to process. If None, processes all packages
+            in dependency order.
+        setup: Whether to fetch toolchain and update lakefile before building.
+        fresh: Clear cached dependencies to force fresh resolution. Use this for
+            nightly updates to get the latest compatible versions of all packages.
+        verbose: Enable verbose logging.
+
+    Raises:
+        RuntimeError: If any build step fails.
+    """
+    if packages is None:
+        packages = get_extraction_order()
+
+    logger.info(f"Running doc-gen4 for packages: {', '.join(packages)}")
+
+    for package_name in packages:
+        if package_name not in PACKAGE_REGISTRY:
+            raise ValueError(f"Unknown package: {package_name}")
+
+        config = PACKAGE_REGISTRY[package_name]
+        workspace_path = Path("lean") / package_name
+        logger.info(f"\n{'='*50}\nPackage: {package_name}\n{'='*50}")
+
+        if fresh:
+            _clear_workspace_cache(workspace_path)
+
+        if setup:
+            toolchain, ref = _setup_workspace(config)
+            logger.info(f"Toolchain: {toolchain}, ref: {ref}")
+
+        _run_lake_for_package(package_name, verbose)
+
+    logger.info("doc-gen4 generation complete for all packages")
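
The new `doc_gen4` module is the build step of the extraction pipeline: it prepares each `lean/<package>` workspace, runs `lake update` and `lake build`, and then builds each library's `:docs` facet. Below is a minimal sketch of driving it from a script, assuming the module layout shown in this diff is installed and the `lean/<package>` workspaces exist; the package name and flag choices are illustrative.

```python
# Sketch only: invoke the doc-gen4 build step shown above directly.
# Assumes lean_explore 1.0.0 is installed and a "lean/mathlib" workspace
# is present; "mathlib" and the flag values are illustrative.
import asyncio

from lean_explore.extract.doc_gen4 import run_doc_gen4


async def main() -> None:
    # Rebuild only mathlib's workspace with fresh dependency resolution,
    # streaming lake output as it runs.
    await run_doc_gen4(packages=["mathlib"], setup=True, fresh=True, verbose=True)


if __name__ == "__main__":
    asyncio.run(main())
```

Note that `run_doc_gen4` contains no awaits of its own; it shells out to `lake` synchronously, so the async signature mainly lets it slot into the same pipeline as the database-backed steps that follow.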
lean_explore/extract/doc_parser.py
@@ -0,0 +1,499 @@
+"""Parser for Lean doc-gen4 output files.
+
+This module parses doc-gen4 JSON data and extracts Lean source code
+to produce Declaration objects ready for database insertion.
+"""
+
+import json
+import logging
+import re
+from pathlib import Path
+
+from rich.progress import (
+    BarColumn,
+    Progress,
+    SpinnerColumn,
+    TaskProgressColumn,
+    TextColumn,
+    TimeRemainingColumn,
+)
+from sqlalchemy.dialects.postgresql import insert
+from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession
+
+from lean_explore.extract.types import Declaration
+from lean_explore.models import Declaration as DBDeclaration
+
+logger = logging.getLogger(__name__)
+
+
+def _strip_lean_comments(source_text: str) -> str:
+    """Strip Lean comments from source text for comparison.
+
+    Removes:
+    - Line comments: -- to end of line
+    - Block comments: /- ... -/ (including nested)
+    - Doc comments: /-- ... -/ (just a special form of block comments)
+
+    Returns normalized text with collapsed whitespace for reliable comparison.
+    """
+    result = []
+    i = 0
+    length = len(source_text)
+
+    while i < length:
+        # Check for block comment (includes doc comments /-- ... -/)
+        if i < length - 1 and source_text[i : i + 2] == "/-":
+            # Skip the opening /-
+            i += 2
+            nesting_level = 1
+            while i < length and nesting_level > 0:
+                if i < length - 1 and source_text[i : i + 2] == "/-":
+                    nesting_level += 1
+                    i += 2
+                elif i < length - 1 and source_text[i : i + 2] == "-/":
+                    nesting_level -= 1
+                    i += 2
+                else:
+                    i += 1
+            continue
+
+        # Check for line comment
+        if i < length - 1 and source_text[i : i + 2] == "--":
+            # Skip to end of line
+            while i < length and source_text[i] != "\n":
+                i += 1
+            continue
+
+        result.append(source_text[i])
+        i += 1
+
+    # Normalize whitespace: collapse multiple spaces/newlines into single space
+    text = "".join(result)
+    return " ".join(text.split())
+
+
+def _filter_auto_generated_projections(
+    declarations: list[Declaration],
+) -> tuple[list[Declaration], int]:
+    """Filter out auto-generated 'to*' projections that share source text with parent.
+
+    When a Lean structure extends another, it automatically generates projections
+    like `Scheme.toLocallyRingedSpace` that point to the same source location as
+    the parent `Scheme` structure. These should be filtered out.
+
+    However, legitimate definitions like `IsOpenImmersion.toScheme` have their
+    own unique source text and should be kept.
+
+    Args:
+        declarations: List of all extracted declarations.
+
+    Returns:
+        Tuple of (filtered declarations, count of removed projections).
+    """
+    # Build a map of stripped source text -> list of declaration names
+    source_to_names: dict[str, list[str]] = {}
+    for declaration in declarations:
+        stripped = _strip_lean_comments(declaration.source_text)
+        if stripped not in source_to_names:
+            source_to_names[stripped] = []
+        source_to_names[stripped].append(declaration.name)
+
+    filtered = []
+    removed_count = 0
+
+    for declaration in declarations:
+        short_name = declaration.name.rsplit(".", 1)[-1]
+
+        # Check if this looks like a 'toFoo' projection (to + uppercase letter)
+        is_to_projection = (
+            len(short_name) > 2
+            and short_name.startswith("to")
+            and short_name[2].isupper()
+        )
+
+        if is_to_projection:
+            stripped = _strip_lean_comments(declaration.source_text)
+            declarations_with_same_source = source_to_names.get(stripped, [])
+
+            # If other declarations share this source text, this is auto-generated
+            if len(declarations_with_same_source) > 1:
+                removed_count += 1
+                continue
+
+        filtered.append(declaration)
+
+    return filtered, removed_count
+
+
+def _build_package_cache(
+    lean_root: str | Path, workspace_name: str | None = None
+) -> dict[str, Path]:
+    """Build a cache of package names to their actual directories.
+
+    When workspace_name is provided, only includes packages from that specific
+    workspace's .lake/packages directory. This ensures source files are resolved
+    from the correct workspace, avoiding version mismatches between workspaces.
+
+    Args:
+        lean_root: Root directory containing package workspaces.
+        workspace_name: If provided, only include packages from this workspace.
+            If None, includes packages from all workspaces (legacy behavior).
+
+    Returns:
+        Dictionary mapping lowercase package names to their directory paths.
+    """
+    from lean_explore.extract.package_utils import get_extraction_order
+
+    lean_root = Path(lean_root)
+    cache = {}
+
+    # Determine which workspaces to scan
+    workspaces = [workspace_name] if workspace_name else get_extraction_order()
+
+    # Collect packages from workspace(s)
+    for ws_name in workspaces:
+        packages_directory = lean_root / ws_name / ".lake" / "packages"
+        if packages_directory.exists():
+            for package_directory in packages_directory.iterdir():
+                if package_directory.is_dir():
+                    cache[package_directory.name.lower()] = package_directory
+
+    # Add toolchain - use specified workspace or find first available
+    if workspace_name:
+        toolchain_workspaces = [workspace_name]
+    else:
+        toolchain_workspaces = get_extraction_order()
+    for ws_name in toolchain_workspaces:
+        toolchain_file = lean_root / ws_name / "lean-toolchain"
+        if toolchain_file.exists():
+            version = toolchain_file.read_text().strip().split(":")[-1]
+            toolchain_path = (
+                Path.home()
+                / ".elan"
+                / "toolchains"
+                / f"leanprover--lean4---{version}"
+                / "src"
+                / "lean"
+            )
+            if toolchain_path.exists():
+                cache["lean4"] = toolchain_path
+                break
+
+    return cache
+
+
+def _extract_dependencies_from_html(html: str) -> list[str]:
+    """Extract dependency names from HTML declaration header."""
+    href_pattern = r'href="[^"]*#([^"]+)"'
+    matches = re.findall(href_pattern, html)
+
+    dependencies = []
+    seen = set()
+    for match in matches:
+        if match not in seen:
+            dependencies.append(match)
+            seen.add(match)
+
+    return dependencies
+
+
+def _read_source_lines(file_path: str | Path, line_start: int, line_end: int) -> str:
+    """Read specific lines from a source file.
+
+    If the extracted text is just an attribute (like @[to_additive]), extends
+    the range to include the full declaration.
+    """
+    file_path = Path(file_path)
+    with open(file_path, encoding="utf-8") as f:
+        lines = f.readlines()
+    if line_start > len(lines) or line_end > len(lines):
+        raise ValueError(
+            f"Line range {line_start}-{line_end} out of bounds for {file_path}"
+        )
+
+    result = "".join(lines[line_start - 1 : line_end])
+
+    # If result starts with an attribute, extend to get the full declaration
+    stripped = result.strip()
+    if stripped.startswith("@["):
+        extended_end = line_end
+        while extended_end < len(lines):
+            extended_end += 1
+            extended_result = "".join(lines[line_start - 1 : extended_end])
+            if any(
+                kw in extended_result
+                for kw in [
+                    " def ",
+                    " theorem ",
+                    " lemma ",
+                    " instance ",
+                    " class ",
+                    " structure ",
+                    " inductive ",
+                    " abbrev ",
+                    ":=",
+                ]
+            ):
+                return extended_result.rstrip()
+        return "".join(lines[line_start - 1 : extended_end]).rstrip()
+
+    return result
+
+
+def _extract_source_text(
+    source_link: str, lean_root: str | Path, package_cache: dict[str, Path]
+) -> str:
+    """Extract source text from a Lean file given a GitHub source link."""
+    lean_root = Path(lean_root)
+    match = re.search(
+        r"github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+\.lean)#L(\d+)-L(\d+)",
+        source_link,
+    )
+    if not match:
+        raise ValueError(f"Could not parse source link: {source_link}")
+
+    (
+        organization_name,
+        package_name,
+        file_path_string,
+        line_start_string,
+        line_end_string,
+    ) = match.groups()
+    line_start = int(line_start_string)
+    line_end = int(line_end_string)
+
+    candidates = []
+
+    for variant in [
+        package_name.lower(),
+        package_name.rstrip("0123456789").lower(),
+        package_name.replace("-", "").lower(),
+    ]:
+        if variant in package_cache:
+            if variant == "lean4" and file_path_string.startswith("src/"):
+                adjusted_path = file_path_string[4:]
+            else:
+                adjusted_path = file_path_string
+            candidates.append(package_cache[variant] / adjusted_path)
+
+    candidates.append(lean_root / file_path_string)
+
+    for candidate in candidates:
+        if candidate.exists():
+            return _read_source_lines(candidate, line_start, line_end)
+
+    for package_directory in package_cache.values():
+        candidate = package_directory / file_path_string
+        if candidate.exists():
+            return _read_source_lines(candidate, line_start, line_end)
+
+    raise FileNotFoundError(
+        f"Could not find {file_path_string} for package {package_name}"
+    )
+
+
+def _parse_declarations_from_files(
+    bmp_files: list[Path],
+    lean_root: Path,
+    package_cache: dict[str, Path],
+    allowed_module_prefixes: list[str],
+) -> list[Declaration]:
+    """Parse declarations from doc-gen4 BMP files.
+
+    Args:
+        bmp_files: List of paths to BMP files containing declaration data.
+        lean_root: Root directory of the Lean project.
+        package_cache: Dictionary mapping package names to their directories.
+        allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]).
+
+    Returns:
+        List of parsed Declaration objects.
+    """
+    declarations = []
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+    ) as progress:
+        task = progress.add_task("[cyan]Parsing BMP files...", total=len(bmp_files))
+
+        for file_path in bmp_files:
+            with open(file_path, encoding="utf-8") as f:
+                data = json.load(f)
+
+            module_name = data["name"]
+
+            # Only extract modules matching the allowed prefixes for this workspace
+            # Use prefix + "." to avoid "Lean" matching "LeanSearchClient"
+            matches_prefix = any(
+                module_name == prefix or module_name.startswith(prefix + ".")
+                for prefix in allowed_module_prefixes
+            )
+            if not matches_prefix:
+                progress.update(task, advance=1)
+                continue
+
+            for declaration_data in data.get("declarations", []):
+                information = declaration_data["info"]
+                source_text = _extract_source_text(
+                    information["sourceLink"], lean_root, package_cache
+                )
+
+                header_html = declaration_data.get("header", "")
+                dependencies = _extract_dependencies_from_html(header_html)
+
+                # Filter out self-references from dependencies
+                declaration_name = information["name"]
+                filtered_dependencies = [
+                    d for d in dependencies if d != declaration_name
+                ]
+
+                # Skip auto-generated .mk constructors
+                if declaration_name.endswith(".mk"):
+                    continue
+
+                declarations.append(
+                    Declaration(
+                        name=declaration_name,
+                        module=module_name,
+                        docstring=information.get("doc"),
+                        source_text=source_text,
+                        source_link=information["sourceLink"],
+                        dependencies=filtered_dependencies or None,
+                    )
+                )
+
+            progress.update(task, advance=1)
+
+    return declarations
+
+
+async def _insert_declarations_batch(
+    session: AsyncSession, declarations: list[Declaration], batch_size: int = 1000
+) -> int:
+    """Insert declarations into database in batches.
+
+    Args:
+        session: Active database session.
+        declarations: List of declarations to insert.
+        batch_size: Number of declarations to insert per batch.
+
+    Returns:
+        Number of declarations successfully inserted.
+    """
+    inserted_count = 0
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+    ) as progress:
+        task = progress.add_task(
+            "[green]Inserting declarations into database...",
+            total=len(declarations),
+        )
+
+        async with session.begin():
+            for i in range(0, len(declarations), batch_size):
+                batch = declarations[i : i + batch_size]
+
+                for declaration in batch:
+                    dependencies_json = (
+                        json.dumps(declaration.dependencies)
+                        if declaration.dependencies
+                        else None
+                    )
+                    statement = (
+                        insert(DBDeclaration)
+                        .values(
+                            name=declaration.name,
+                            module=declaration.module,
+                            docstring=declaration.docstring,
+                            source_text=declaration.source_text,
+                            source_link=declaration.source_link,
+                            dependencies=dependencies_json,
+                        )
+                        .on_conflict_do_nothing(index_elements=["name"])
+                    )
+
+                    result = await session.execute(statement)
+                    inserted_count += result.rowcount
+                    progress.update(task, advance=1)
+
+    return inserted_count
+
+
+async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> None:
+    """Extract all declarations from doc-gen4 data and load into database.
+
+    Looks for BMP files in each package's .lake/build/doc-data directory.
+    Extracts only declarations matching the package's configured module_prefixes,
+    ensuring each package's declarations come from its own workspace.
+
+    Args:
+        engine: SQLAlchemy async engine for database connection.
+        batch_size: Number of declarations to insert per database transaction.
+    """
+    from lean_explore.extract.package_registry import PACKAGE_REGISTRY
+    from lean_explore.extract.package_utils import get_extraction_order
+
+    lean_root = Path("lean")
+    all_declarations = []
+
+    # Process each workspace separately with its own package cache
+    for package_name in get_extraction_order():
+        package_config = PACKAGE_REGISTRY[package_name]
+        doc_data_dir = lean_root / package_name / ".lake" / "build" / "doc-data"
+
+        if not doc_data_dir.exists():
+            logger.warning(f"No doc-data directory for {package_name}: {doc_data_dir}")
+            continue
+
+        bmp_files = sorted(doc_data_dir.glob("**/*.bmp"))
+        logger.info(f"Found {len(bmp_files)} BMP files in {package_name}")
+
+        if not bmp_files:
+            continue
+
+        # Build workspace-specific package cache to avoid version mismatches
+        package_cache = _build_package_cache(lean_root, package_name)
+        logger.info(
+            f"Built package cache for {package_name} with {len(package_cache)} packages"
+        )
+
+        declarations = _parse_declarations_from_files(
+            bmp_files, lean_root, package_cache, package_config.module_prefixes
+        )
+        logger.info(
+            f"Extracted {len(declarations)} declarations from {package_name} "
+            f"(prefixes: {package_config.module_prefixes})"
+        )
+        all_declarations.extend(declarations)

+    if not all_declarations:
+        raise FileNotFoundError("No declarations extracted from any package workspace")
+
+    logger.info(f"Total declarations extracted: {len(all_declarations)}")
+
+    # Filter out auto-generated 'to*' projections that share source with parent
+    all_declarations, projection_count = _filter_auto_generated_projections(
+        all_declarations
+    )
+    if projection_count > 0:
+        logger.info(f"Filtered {projection_count} auto-generated 'to*' projections")
+
+    async with AsyncSession(engine) as session:
+        inserted_count = await _insert_declarations_batch(
+            session, all_declarations, batch_size
+        )
+
+    skipped = len(all_declarations) - inserted_count
+    logger.info(
+        f"Inserted {inserted_count} new declarations into database "
+        f"(skipped {skipped} duplicates)"
+    )
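
`extract_declarations` is the load step that follows the doc-gen4 build: it walks each workspace's `.lake/build/doc-data` BMP output, resolves source text from the matching workspace, filters auto-generated projections, and bulk-inserts rows using the PostgreSQL `ON CONFLICT DO NOTHING` insert. A minimal sketch of invoking it is below; the connection URL, the asyncpg driver, and the assumption that the `declarations` schema already exists are all illustrative, not part of this release's documented interface. A PostgreSQL-backed engine is required because the module imports `insert` from `sqlalchemy.dialects.postgresql`.

```python
# Sketch only: run the parsing/loading step shown above against a local
# PostgreSQL database. The DSN and asyncpg driver are assumptions for
# illustration; the table for lean_explore.models.Declaration is assumed
# to have been created beforehand.
import asyncio

from sqlalchemy.ext.asyncio import create_async_engine

from lean_explore.extract.doc_parser import extract_declarations


async def main() -> None:
    engine = create_async_engine(
        "postgresql+asyncpg://user:password@localhost/lean_explore"
    )
    try:
        # Parse every workspace's BMP files and insert new declarations,
        # committing in batches of 1000.
        await extract_declarations(engine, batch_size=1000)
    finally:
        await engine.dispose()


if __name__ == "__main__":
    asyncio.run(main())
```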