lean-explore 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. lean_explore/__init__.py +14 -1
  2. lean_explore/api/__init__.py +12 -1
  3. lean_explore/api/client.py +64 -176
  4. lean_explore/cli/__init__.py +10 -1
  5. lean_explore/cli/data_commands.py +157 -479
  6. lean_explore/cli/display.py +171 -0
  7. lean_explore/cli/main.py +51 -608
  8. lean_explore/config.py +244 -0
  9. lean_explore/extract/__init__.py +5 -0
  10. lean_explore/extract/__main__.py +368 -0
  11. lean_explore/extract/doc_gen4.py +200 -0
  12. lean_explore/extract/doc_parser.py +499 -0
  13. lean_explore/extract/embeddings.py +371 -0
  14. lean_explore/extract/github.py +110 -0
  15. lean_explore/extract/index.py +317 -0
  16. lean_explore/extract/informalize.py +653 -0
  17. lean_explore/extract/package_config.py +59 -0
  18. lean_explore/extract/package_registry.py +45 -0
  19. lean_explore/extract/package_utils.py +105 -0
  20. lean_explore/extract/types.py +25 -0
  21. lean_explore/mcp/__init__.py +11 -1
  22. lean_explore/mcp/app.py +14 -46
  23. lean_explore/mcp/server.py +20 -35
  24. lean_explore/mcp/tools.py +70 -205
  25. lean_explore/models/__init__.py +9 -0
  26. lean_explore/models/search_db.py +76 -0
  27. lean_explore/models/search_types.py +53 -0
  28. lean_explore/search/__init__.py +32 -0
  29. lean_explore/search/engine.py +655 -0
  30. lean_explore/search/scoring.py +156 -0
  31. lean_explore/search/service.py +68 -0
  32. lean_explore/search/tokenization.py +71 -0
  33. lean_explore/util/__init__.py +28 -0
  34. lean_explore/util/embedding_client.py +92 -0
  35. lean_explore/util/logging.py +22 -0
  36. lean_explore/util/openrouter_client.py +63 -0
  37. lean_explore/util/reranker_client.py +189 -0
  38. {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/METADATA +32 -9
  39. lean_explore-1.0.0.dist-info/RECORD +43 -0
  40. {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/WHEEL +1 -1
  41. lean_explore-1.0.0.dist-info/entry_points.txt +2 -0
  42. lean_explore/cli/agent.py +0 -788
  43. lean_explore/cli/config_utils.py +0 -481
  44. lean_explore/defaults.py +0 -114
  45. lean_explore/local/__init__.py +0 -1
  46. lean_explore/local/search.py +0 -1050
  47. lean_explore/local/service.py +0 -479
  48. lean_explore/shared/__init__.py +0 -1
  49. lean_explore/shared/models/__init__.py +0 -1
  50. lean_explore/shared/models/api.py +0 -117
  51. lean_explore/shared/models/db.py +0 -396
  52. lean_explore-0.3.0.dist-info/RECORD +0 -26
  53. lean_explore-0.3.0.dist-info/entry_points.txt +0 -2
  54. {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/licenses/LICENSE +0 -0
  55. {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/top_level.txt +0 -0
lean_explore/extract/doc_gen4.py
@@ -0,0 +1,200 @@
+"""Documentation generation using doc-gen4 for each package.
+
+This module provides functionality to run doc-gen4 on each package workspace
+to generate Lean documentation data for the extraction pipeline.
+"""
+
+import logging
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+from lean_explore.extract.github import extract_lean_version
+from lean_explore.extract.package_config import PackageConfig
+from lean_explore.extract.package_registry import PACKAGE_REGISTRY
+from lean_explore.extract.package_utils import (
+    get_extraction_order,
+    get_package_toolchain,
+    update_lakefile_docgen_version,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _clear_workspace_cache(workspace_path: Path) -> None:
+    """Clear entire Lake cache to force complete rebuild.
+
+    Removes the .lake/ directory and lake-manifest.json to ensure:
+    1. Fresh dependency resolution (latest compatible versions)
+    2. Fresh doc-gen4 output (regenerated BMP files)
+    3. No stale build artifacts
+
+    Use this for nightly updates to get a clean build from scratch.
+
+    Args:
+        workspace_path: Path to the package workspace.
+    """
+    manifest = workspace_path / "lake-manifest.json"
+    if manifest.exists():
+        logger.info(f"Removing {manifest}")
+        manifest.unlink()
+
+    lake_dir = workspace_path / ".lake"
+    if lake_dir.exists():
+        logger.info(f"Removing {lake_dir} to force complete rebuild")
+        shutil.rmtree(lake_dir)
+
+
+def _get_doc_lib_names(package_name: str) -> list[str]:
+    """Get the library names to run doc-gen4 on for a package.
+
+    Some packages have custom extract wrappers; others use upstream libraries directly.
+    """
+    lib_names: dict[str, list[str]] = {
+        "mathlib": ["MathExtract"],
+        "physlean": ["PhysExtract"],
+        "flt": ["FLTExtract"],
+        "formal-conjectures": ["FormalConjectures", "FormalConjecturesForMathlib"],
+        "cslib": ["CslibExtract"],
+    }
+    return lib_names.get(package_name, [f"{package_name.title()}Extract"])
+
+
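
For packages absent from the map, the fallback title-cases the package name. A minimal illustration of the lookup; `polyrith` is a made-up package name used only to show the default path:

    _get_doc_lib_names("mathlib")   # -> ["MathExtract"] (explicit entry)
    _get_doc_lib_names("polyrith")  # -> ["PolyrithExtract"] (title-cased fallback)
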
+def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]:
+    """Fetch toolchain from GitHub and update lakefile.
+
+    Returns:
+        Tuple of (lean_toolchain, git_ref).
+    """
+    workspace_path = Path("lean") / package_config.name
+    lakefile_path = workspace_path / "lakefile.lean"
+    toolchain_file = workspace_path / "lean-toolchain"
+
+    lean_toolchain, git_ref = get_package_toolchain(package_config)
+    lean_version = extract_lean_version(lean_toolchain)
+
+    update_lakefile_docgen_version(lakefile_path, lean_version)
+    toolchain_file.write_text(lean_toolchain + "\n")
+
+    return lean_toolchain, git_ref
+
+
+def _run_lake_for_package(package_name: str, verbose: bool = False) -> None:
+    """Run lake update, cache get, and doc-gen4 for a package."""
+    workspace_path = Path("lean") / package_name
+    package_config = PACKAGE_REGISTRY[package_name]
+    env = os.environ.copy()
+    env["MATHLIB_NO_CACHE_ON_UPDATE"] = "1"
+
+    logger.info(f"[{package_name}] Running lake update...")
+    result = subprocess.run(
+        ["lake", "update"],
+        cwd=workspace_path,
+        capture_output=True,
+        text=True,
+        env=env,
+    )
+    if verbose and result.stdout:
+        logger.info(result.stdout)
+    if result.returncode != 0:
+        logger.error(result.stderr)
+        raise RuntimeError(f"lake update failed for {package_name}")
+
+    # Fetch mathlib cache for packages that depend on mathlib
+    if "mathlib" in package_config.depends_on or package_name == "mathlib":
+        logger.info(f"[{package_name}] Fetching mathlib cache...")
+        result = subprocess.run(
+            ["lake", "exe", "cache", "get"],
+            cwd=workspace_path,
+            capture_output=True,
+            text=True,
+            env=env,
+        )
+        if verbose and result.stdout:
+            logger.info(result.stdout)
+        if result.returncode != 0:
+            logger.warning(f"[{package_name}] Cache fetch failed (non-fatal)")
+
+    logger.info(f"[{package_name}] Running lake build...")
+    process = subprocess.Popen(
+        ["lake", "build"],
+        cwd=workspace_path,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
+    )
+    if process.stdout:
+        for line in process.stdout:
+            print(line, end="", flush=True)
+    if process.wait() != 0:
+        raise RuntimeError(f"lake build failed for {package_name}")
+
+    lib_names = _get_doc_lib_names(package_name)
+    for lib_name in lib_names:
+        logger.info(f"[{package_name}] Running doc-gen4 ({lib_name}:docs)...")
+
+        process = subprocess.Popen(
+            ["lake", "build", f"{lib_name}:docs"],
+            cwd=workspace_path,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+            env=env,
+        )
+        if process.stdout:
+            for line in process.stdout:
+                print(line, end="", flush=True)
+        returncode = process.wait()
+        if returncode != 0:
+            logger.warning(
+                f"[{package_name}] doc-gen4 had failures for {lib_name} "
+                "(continuing with generated docs)"
+            )
+
+
+async def run_doc_gen4(
+    packages: list[str] | None = None,
+    setup: bool = True,
+    fresh: bool = False,
+    verbose: bool = False,
+) -> None:
+    """Run doc-gen4 for each package to generate documentation data.
+
+    Args:
+        packages: List of package names to process. If None, processes all packages
+            in dependency order.
+        setup: Whether to fetch toolchain and update lakefile before building.
+        fresh: Clear cached dependencies to force fresh resolution. Use this for
+            nightly updates to get the latest compatible versions of all packages.
+        verbose: Enable verbose logging.
+
+    Raises:
+        RuntimeError: If any build step fails.
+    """
+    if packages is None:
+        packages = get_extraction_order()
+
+    logger.info(f"Running doc-gen4 for packages: {', '.join(packages)}")
+
+    for package_name in packages:
+        if package_name not in PACKAGE_REGISTRY:
+            raise ValueError(f"Unknown package: {package_name}")
+
+        config = PACKAGE_REGISTRY[package_name]
+        workspace_path = Path("lean") / package_name
+        logger.info(f"\n{'='*50}\nPackage: {package_name}\n{'='*50}")
+
+        if fresh:
+            _clear_workspace_cache(workspace_path)
+
+        if setup:
+            toolchain, ref = _setup_workspace(config)
+            logger.info(f"Toolchain: {toolchain}, ref: {ref}")
+
+        _run_lake_for_package(package_name, verbose)
+
+    logger.info("doc-gen4 generation complete for all packages")
lean_explore/extract/doc_parser.py
@@ -0,0 +1,499 @@
+"""Parser for Lean doc-gen4 output files.
+
+This module parses doc-gen4 JSON data and extracts Lean source code
+to produce Declaration objects ready for database insertion.
+"""
+
+import json
+import logging
+import re
+from pathlib import Path
+
+from rich.progress import (
+    BarColumn,
+    Progress,
+    SpinnerColumn,
+    TaskProgressColumn,
+    TextColumn,
+    TimeRemainingColumn,
+)
+from sqlalchemy.dialects.postgresql import insert
+from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession
+
+from lean_explore.extract.types import Declaration
+from lean_explore.models import Declaration as DBDeclaration
+
+logger = logging.getLogger(__name__)
+
+
+def _strip_lean_comments(source_text: str) -> str:
+    """Strip Lean comments from source text for comparison.
+
+    Removes:
+    - Line comments: -- to end of line
+    - Block comments: /- ... -/ (including nested)
+    - Doc comments: /-- ... -/ (just a special form of block comments)
+
+    Returns normalized text with collapsed whitespace for reliable comparison.
+    """
+    result = []
+    i = 0
+    length = len(source_text)
+
+    while i < length:
+        # Check for block comment (includes doc comments /-- ... -/)
+        if i < length - 1 and source_text[i : i + 2] == "/-":
+            # Skip the opening /-
+            i += 2
+            nesting_level = 1
+            while i < length and nesting_level > 0:
+                if i < length - 1 and source_text[i : i + 2] == "/-":
+                    nesting_level += 1
+                    i += 2
+                elif i < length - 1 and source_text[i : i + 2] == "-/":
+                    nesting_level -= 1
+                    i += 2
+                else:
+                    i += 1
+            continue
+
+        # Check for line comment
+        if i < length - 1 and source_text[i : i + 2] == "--":
+            # Skip to end of line
+            while i < length and source_text[i] != "\n":
+                i += 1
+            continue
+
+        result.append(source_text[i])
+        i += 1
+
+    # Normalize whitespace: collapse multiple spaces/newlines into single space
+    text = "".join(result)
+    return " ".join(text.split())
+
+
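
A short sketch of the normalization this helper performs, assuming the behavior documented above:

    source = """/-- A doc comment. -/
    theorem foo : 1 = 1 := rfl  -- trivial
    /- block /- nested -/ comment -/"""
    print(_strip_lean_comments(source))
    # -> "theorem foo : 1 = 1 := rfl"
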
+def _filter_auto_generated_projections(
+    declarations: list[Declaration],
+) -> tuple[list[Declaration], int]:
+    """Filter out auto-generated 'to*' projections that share source text with parent.
+
+    When a Lean structure extends another, it automatically generates projections
+    like `Scheme.toLocallyRingedSpace` that point to the same source location as
+    the parent `Scheme` structure. These should be filtered out.
+
+    However, legitimate definitions like `IsOpenImmersion.toScheme` have their
+    own unique source text and should be kept.
+
+    Args:
+        declarations: List of all extracted declarations.
+
+    Returns:
+        Tuple of (filtered declarations, count of removed projections).
+    """
+    # Build a map of stripped source text -> list of declaration names
+    source_to_names: dict[str, list[str]] = {}
+    for declaration in declarations:
+        stripped = _strip_lean_comments(declaration.source_text)
+        if stripped not in source_to_names:
+            source_to_names[stripped] = []
+        source_to_names[stripped].append(declaration.name)
+
+    filtered = []
+    removed_count = 0
+
+    for declaration in declarations:
+        short_name = declaration.name.rsplit(".", 1)[-1]
+
+        # Check if this looks like a 'toFoo' projection (to + uppercase letter)
+        is_to_projection = (
+            len(short_name) > 2
+            and short_name.startswith("to")
+            and short_name[2].isupper()
+        )
+
+        if is_to_projection:
+            stripped = _strip_lean_comments(declaration.source_text)
+            declarations_with_same_source = source_to_names.get(stripped, [])
+
+            # If other declarations share this source text, this is auto-generated
+            if len(declarations_with_same_source) > 1:
+                removed_count += 1
+                continue
+
+        filtered.append(declaration)
+
+    return filtered, removed_count
+
+
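
A minimal sketch of the heuristic, assuming `Declaration` accepts the keyword fields used by the parser below (the source texts are invented):

    shared = "structure Scheme extends LocallyRingedSpace"
    decls = [
        Declaration(name="Scheme", module="M", docstring=None,
                    source_text=shared, source_link="", dependencies=None),
        Declaration(name="Scheme.toLocallyRingedSpace", module="M", docstring=None,
                    source_text=shared, source_link="", dependencies=None),
    ]
    kept, removed = _filter_auto_generated_projections(decls)
    # kept retains only "Scheme"; removed == 1
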
+def _build_package_cache(
+    lean_root: str | Path, workspace_name: str | None = None
+) -> dict[str, Path]:
+    """Build a cache of package names to their actual directories.
+
+    When workspace_name is provided, only includes packages from that specific
+    workspace's .lake/packages directory. This ensures source files are resolved
+    from the correct workspace, avoiding version mismatches between workspaces.
+
+    Args:
+        lean_root: Root directory containing package workspaces.
+        workspace_name: If provided, only include packages from this workspace.
+            If None, includes packages from all workspaces (legacy behavior).
+
+    Returns:
+        Dictionary mapping lowercase package names to their directory paths.
+    """
+    from lean_explore.extract.package_utils import get_extraction_order
+
+    lean_root = Path(lean_root)
+    cache = {}
+
+    # Determine which workspaces to scan
+    workspaces = [workspace_name] if workspace_name else get_extraction_order()
+
+    # Collect packages from workspace(s)
+    for ws_name in workspaces:
+        packages_directory = lean_root / ws_name / ".lake" / "packages"
+        if packages_directory.exists():
+            for package_directory in packages_directory.iterdir():
+                if package_directory.is_dir():
+                    cache[package_directory.name.lower()] = package_directory
+
+    # Add toolchain - use specified workspace or find first available
+    if workspace_name:
+        toolchain_workspaces = [workspace_name]
+    else:
+        toolchain_workspaces = get_extraction_order()
+    for ws_name in toolchain_workspaces:
+        toolchain_file = lean_root / ws_name / "lean-toolchain"
+        if toolchain_file.exists():
+            version = toolchain_file.read_text().strip().split(":")[-1]
+            toolchain_path = (
+                Path.home()
+                / ".elan"
+                / "toolchains"
+                / f"leanprover--lean4---{version}"
+                / "src"
+                / "lean"
+            )
+            if toolchain_path.exists():
+                cache["lean4"] = toolchain_path
+                break
+
+    return cache
+
+
+def _extract_dependencies_from_html(html: str) -> list[str]:
+    """Extract dependency names from HTML declaration header."""
+    href_pattern = r'href="[^"]*#([^"]+)"'
+    matches = re.findall(href_pattern, html)
+
+    dependencies = []
+    seen = set()
+    for match in matches:
+        if match not in seen:
+            dependencies.append(match)
+            seen.add(match)
+
+    return dependencies
+
+
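
A quick check of the href pattern against a fabricated doc-gen4 header fragment:

    html = (
        '<a href="../Mathlib/Order/Basic.html#LE.le">LE.le</a>'
        '<a href="../Init/Prelude.html#Eq">Eq</a>'
        '<a href="../Init/Prelude.html#Eq">Eq</a>'
    )
    print(_extract_dependencies_from_html(html))
    # -> ['LE.le', 'Eq']  (first-seen order, duplicates removed)
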
+def _read_source_lines(file_path: str | Path, line_start: int, line_end: int) -> str:
+    """Read specific lines from a source file.
+
+    If the extracted text is just an attribute (like @[to_additive]), extends
+    the range to include the full declaration.
+    """
+    file_path = Path(file_path)
+    with open(file_path, encoding="utf-8") as f:
+        lines = f.readlines()
+    if line_start > len(lines) or line_end > len(lines):
+        raise ValueError(
+            f"Line range {line_start}-{line_end} out of bounds for {file_path}"
+        )
+
+    result = "".join(lines[line_start - 1 : line_end])
+
+    # If result starts with an attribute, extend to get the full declaration
+    stripped = result.strip()
+    if stripped.startswith("@["):
+        extended_end = line_end
+        while extended_end < len(lines):
+            extended_end += 1
+            extended_result = "".join(lines[line_start - 1 : extended_end])
+            if any(
+                kw in extended_result
+                for kw in [
+                    " def ",
+                    " theorem ",
+                    " lemma ",
+                    " instance ",
+                    " class ",
+                    " structure ",
+                    " inductive ",
+                    " abbrev ",
+                    ":=",
+                ]
+            ):
+                return extended_result.rstrip()
+        return "".join(lines[line_start - 1 : extended_end]).rstrip()
+
+    return result
+
+
+def _extract_source_text(
+    source_link: str, lean_root: str | Path, package_cache: dict[str, Path]
+) -> str:
+    """Extract source text from a Lean file given a GitHub source link."""
+    lean_root = Path(lean_root)
+    match = re.search(
+        r"github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+\.lean)#L(\d+)-L(\d+)",
+        source_link,
+    )
+    if not match:
+        raise ValueError(f"Could not parse source link: {source_link}")
+
+    (
+        organization_name,
+        package_name,
+        file_path_string,
+        line_start_string,
+        line_end_string,
+    ) = match.groups()
+    line_start = int(line_start_string)
+    line_end = int(line_end_string)
+
+    candidates = []
+
+    for variant in [
+        package_name.lower(),
+        package_name.rstrip("0123456789").lower(),
+        package_name.replace("-", "").lower(),
+    ]:
+        if variant in package_cache:
+            if variant == "lean4" and file_path_string.startswith("src/"):
+                adjusted_path = file_path_string[4:]
+            else:
+                adjusted_path = file_path_string
+            candidates.append(package_cache[variant] / adjusted_path)
+
+    candidates.append(lean_root / file_path_string)
+
+    for candidate in candidates:
+        if candidate.exists():
+            return _read_source_lines(candidate, line_start, line_end)
+
+    for package_directory in package_cache.values():
+        candidate = package_directory / file_path_string
+        if candidate.exists():
+            return _read_source_lines(candidate, line_start, line_end)
+
+    raise FileNotFoundError(
+        f"Could not find {file_path_string} for package {package_name}"
+    )
+
+
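
The regex above targets GitHub blob URLs that carry a line range. A sketch of what it captures; the URL is shaped like real doc-gen4 source links but is not taken from actual data:

    link = (
        "https://github.com/leanprover-community/mathlib4/blob/"
        "abc123/Mathlib/Order/Basic.lean#L42-L47"
    )
    m = re.search(
        r"github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+\.lean)#L(\d+)-L(\d+)", link
    )
    print(m.groups())
    # -> ('leanprover-community', 'mathlib4', 'Mathlib/Order/Basic.lean', '42', '47')
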
+def _parse_declarations_from_files(
+    bmp_files: list[Path],
+    lean_root: Path,
+    package_cache: dict[str, Path],
+    allowed_module_prefixes: list[str],
+) -> list[Declaration]:
+    """Parse declarations from doc-gen4 BMP files.
+
+    Args:
+        bmp_files: List of paths to BMP files containing declaration data.
+        lean_root: Root directory of the Lean project.
+        package_cache: Dictionary mapping package names to their directories.
+        allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]).
+
+    Returns:
+        List of parsed Declaration objects.
+    """
+    declarations = []
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+    ) as progress:
+        task = progress.add_task("[cyan]Parsing BMP files...", total=len(bmp_files))
+
+        for file_path in bmp_files:
+            with open(file_path, encoding="utf-8") as f:
+                data = json.load(f)
+
+            module_name = data["name"]
+
+            # Only extract modules matching the allowed prefixes for this workspace
+            # Use prefix + "." to avoid "Lean" matching "LeanSearchClient"
+            matches_prefix = any(
+                module_name == prefix or module_name.startswith(prefix + ".")
+                for prefix in allowed_module_prefixes
+            )
+            if not matches_prefix:
+                progress.update(task, advance=1)
+                continue
+
+            for declaration_data in data.get("declarations", []):
+                information = declaration_data["info"]
+                source_text = _extract_source_text(
+                    information["sourceLink"], lean_root, package_cache
+                )
+
+                header_html = declaration_data.get("header", "")
+                dependencies = _extract_dependencies_from_html(header_html)
+
+                # Filter out self-references from dependencies
+                declaration_name = information["name"]
+                filtered_dependencies = [
+                    d for d in dependencies if d != declaration_name
+                ]
+
+                # Skip auto-generated .mk constructors
+                if declaration_name.endswith(".mk"):
+                    continue
+
+                declarations.append(
+                    Declaration(
+                        name=declaration_name,
+                        module=module_name,
+                        docstring=information.get("doc"),
+                        source_text=source_text,
+                        source_link=information["sourceLink"],
+                        dependencies=filtered_dependencies or None,
+                    )
+                )
+
+            progress.update(task, advance=1)
+
+    return declarations
+
+
+async def _insert_declarations_batch(
+    session: AsyncSession, declarations: list[Declaration], batch_size: int = 1000
+) -> int:
+    """Insert declarations into database in batches.
+
+    Args:
+        session: Active database session.
+        declarations: List of declarations to insert.
+        batch_size: Number of declarations to insert per batch.
+
+    Returns:
+        Number of declarations successfully inserted.
+    """
+    inserted_count = 0
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+    ) as progress:
+        task = progress.add_task(
+            "[green]Inserting declarations into database...",
+            total=len(declarations),
+        )
+
+        async with session.begin():
+            for i in range(0, len(declarations), batch_size):
+                batch = declarations[i : i + batch_size]
+
+                for declaration in batch:
+                    dependencies_json = (
+                        json.dumps(declaration.dependencies)
+                        if declaration.dependencies
+                        else None
+                    )
+                    statement = (
+                        insert(DBDeclaration)
+                        .values(
+                            name=declaration.name,
+                            module=declaration.module,
+                            docstring=declaration.docstring,
+                            source_text=declaration.source_text,
+                            source_link=declaration.source_link,
+                            dependencies=dependencies_json,
+                        )
+                        .on_conflict_do_nothing(index_elements=["name"])
+                    )
+
+                    result = await session.execute(statement)
+                    inserted_count += result.rowcount
+                    progress.update(task, advance=1)
+
+    return inserted_count
+
+
+async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> None:
+    """Extract all declarations from doc-gen4 data and load into database.
+
+    Looks for BMP files in each package's .lake/build/doc-data directory.
+    Extracts only declarations matching the package's configured module_prefixes,
+    ensuring each package's declarations come from its own workspace.
+
+    Args:
+        engine: SQLAlchemy async engine for database connection.
+        batch_size: Number of declarations to insert per database transaction.
+    """
+    from lean_explore.extract.package_registry import PACKAGE_REGISTRY
+    from lean_explore.extract.package_utils import get_extraction_order
+
+    lean_root = Path("lean")
+    all_declarations = []
+
+    # Process each workspace separately with its own package cache
+    for package_name in get_extraction_order():
+        package_config = PACKAGE_REGISTRY[package_name]
+        doc_data_dir = lean_root / package_name / ".lake" / "build" / "doc-data"
+
+        if not doc_data_dir.exists():
+            logger.warning(f"No doc-data directory for {package_name}: {doc_data_dir}")
+            continue
+
+        bmp_files = sorted(doc_data_dir.glob("**/*.bmp"))
+        logger.info(f"Found {len(bmp_files)} BMP files in {package_name}")
+
+        if not bmp_files:
+            continue
+
+        # Build workspace-specific package cache to avoid version mismatches
+        package_cache = _build_package_cache(lean_root, package_name)
+        logger.info(
+            f"Built package cache for {package_name} with {len(package_cache)} packages"
+        )
+
+        declarations = _parse_declarations_from_files(
+            bmp_files, lean_root, package_cache, package_config.module_prefixes
+        )
+        logger.info(
+            f"Extracted {len(declarations)} declarations from {package_name} "
+            f"(prefixes: {package_config.module_prefixes})"
+        )
+        all_declarations.extend(declarations)
+
+    if not all_declarations:
+        raise FileNotFoundError("No declarations extracted from any package workspace")
+
+    logger.info(f"Total declarations extracted: {len(all_declarations)}")
+
+    # Filter out auto-generated 'to*' projections that share source with parent
+    all_declarations, projection_count = _filter_auto_generated_projections(
+        all_declarations
+    )
+    if projection_count > 0:
+        logger.info(f"Filtered {projection_count} auto-generated 'to*' projections")
+
+    async with AsyncSession(engine) as session:
+        inserted_count = await _insert_declarations_batch(
+            session, all_declarations, batch_size
+        )
+
+    skipped = len(all_declarations) - inserted_count
+    logger.info(
+        f"Inserted {inserted_count} new declarations into database "
+        f"(skipped {skipped} duplicates)"
+    )
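
A minimal driver sketch for the extraction entry point. The insert statements use the PostgreSQL dialect, so a PostgreSQL engine is required; the connection URL and the asyncpg driver below are assumptions, not part of the package:

    import asyncio

    from sqlalchemy.ext.asyncio import create_async_engine

    from lean_explore.extract.doc_parser import extract_declarations


    async def main() -> None:
        # Hypothetical DSN; any async PostgreSQL URL works here.
        engine = create_async_engine("postgresql+asyncpg://user:pass@localhost/lean")
        try:
            await extract_declarations(engine, batch_size=1000)
        finally:
            await engine.dispose()


    asyncio.run(main())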