lean-explore 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lean_explore/__init__.py +14 -1
- lean_explore/api/__init__.py +12 -1
- lean_explore/api/client.py +64 -176
- lean_explore/cli/__init__.py +10 -1
- lean_explore/cli/data_commands.py +157 -479
- lean_explore/cli/display.py +171 -0
- lean_explore/cli/main.py +51 -608
- lean_explore/config.py +244 -0
- lean_explore/extract/__init__.py +5 -0
- lean_explore/extract/__main__.py +368 -0
- lean_explore/extract/doc_gen4.py +200 -0
- lean_explore/extract/doc_parser.py +499 -0
- lean_explore/extract/embeddings.py +371 -0
- lean_explore/extract/github.py +110 -0
- lean_explore/extract/index.py +317 -0
- lean_explore/extract/informalize.py +653 -0
- lean_explore/extract/package_config.py +59 -0
- lean_explore/extract/package_registry.py +45 -0
- lean_explore/extract/package_utils.py +105 -0
- lean_explore/extract/types.py +25 -0
- lean_explore/mcp/__init__.py +11 -1
- lean_explore/mcp/app.py +14 -46
- lean_explore/mcp/server.py +20 -35
- lean_explore/mcp/tools.py +70 -205
- lean_explore/models/__init__.py +9 -0
- lean_explore/models/search_db.py +76 -0
- lean_explore/models/search_types.py +53 -0
- lean_explore/search/__init__.py +32 -0
- lean_explore/search/engine.py +655 -0
- lean_explore/search/scoring.py +156 -0
- lean_explore/search/service.py +68 -0
- lean_explore/search/tokenization.py +71 -0
- lean_explore/util/__init__.py +28 -0
- lean_explore/util/embedding_client.py +92 -0
- lean_explore/util/logging.py +22 -0
- lean_explore/util/openrouter_client.py +63 -0
- lean_explore/util/reranker_client.py +189 -0
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/METADATA +32 -9
- lean_explore-1.0.0.dist-info/RECORD +43 -0
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/WHEEL +1 -1
- lean_explore-1.0.0.dist-info/entry_points.txt +2 -0
- lean_explore/cli/agent.py +0 -788
- lean_explore/cli/config_utils.py +0 -481
- lean_explore/defaults.py +0 -114
- lean_explore/local/__init__.py +0 -1
- lean_explore/local/search.py +0 -1050
- lean_explore/local/service.py +0 -479
- lean_explore/shared/__init__.py +0 -1
- lean_explore/shared/models/__init__.py +0 -1
- lean_explore/shared/models/api.py +0 -117
- lean_explore/shared/models/db.py +0 -396
- lean_explore-0.3.0.dist-info/RECORD +0 -26
- lean_explore-0.3.0.dist-info/entry_points.txt +0 -2
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/top_level.txt +0 -0
lean_explore/config.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# src/lean_explore/config.py
|
|
2
|
+
|
|
3
|
+
"""Centralized configuration for lean_explore.
|
|
4
|
+
|
|
5
|
+
This module provides all configuration settings including paths, URLs,
|
|
6
|
+
and other constants used throughout the application.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import pathlib
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _get_active_cache_version() -> str:
|
|
14
|
+
"""Get the active cache version from the version file or environment.
|
|
15
|
+
|
|
16
|
+
The version is determined by (in order of priority):
|
|
17
|
+
1. LEAN_EXPLORE_VERSION environment variable
|
|
18
|
+
2. Contents of ~/.lean_explore/active_version file (set by data fetch)
|
|
19
|
+
3. Default fallback version
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
The active version string (e.g., "2025.01.27" or "v4.24.0").
|
|
23
|
+
"""
|
|
24
|
+
env_version = os.getenv("LEAN_EXPLORE_VERSION")
|
|
25
|
+
if env_version:
|
|
26
|
+
return env_version
|
|
27
|
+
|
|
28
|
+
version_file = pathlib.Path.home() / ".lean_explore" / "active_version"
|
|
29
|
+
if version_file.exists():
|
|
30
|
+
return version_file.read_text().strip()
|
|
31
|
+
|
|
32
|
+
return "v4.24.0"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_data_directory() -> pathlib.Path:
|
|
36
|
+
"""Get the data directory path."""
|
|
37
|
+
return pathlib.Path(
|
|
38
|
+
os.getenv(
|
|
39
|
+
"LEAN_EXPLORE_DATA_DIR",
|
|
40
|
+
pathlib.Path(__file__).parent.parent.parent / "data",
|
|
41
|
+
)
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _get_timestamped_directories(data_directory: pathlib.Path) -> list[pathlib.Path]:
|
|
46
|
+
"""Get all timestamped extraction directories sorted by name descending."""
|
|
47
|
+
import re
|
|
48
|
+
|
|
49
|
+
if not data_directory.exists():
|
|
50
|
+
return []
|
|
51
|
+
|
|
52
|
+
timestamp_pattern = re.compile(r"^\d{8}_\d{6}$")
|
|
53
|
+
timestamped_directories = [
|
|
54
|
+
directory
|
|
55
|
+
for directory in data_directory.iterdir()
|
|
56
|
+
if directory.is_dir() and timestamp_pattern.match(directory.name)
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
timestamped_directories.sort(key=lambda d: d.name, reverse=True)
|
|
60
|
+
return timestamped_directories
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _resolve_active_data_path(
|
|
64
|
+
data_directory: pathlib.Path, active_version: str
|
|
65
|
+
) -> pathlib.Path:
|
|
66
|
+
"""Resolve the active data path using the best available source.
|
|
67
|
+
|
|
68
|
+
Priority:
|
|
69
|
+
1. DATA_DIRECTORY if it contains lean_explore.db directly
|
|
70
|
+
2. Most recent timestamped extraction directory (YYYYMMDD_HHMMSS)
|
|
71
|
+
3. DATA_DIRECTORY / ACTIVE_VERSION as fallback
|
|
72
|
+
"""
|
|
73
|
+
if (data_directory / "lean_explore.db").exists():
|
|
74
|
+
return data_directory
|
|
75
|
+
|
|
76
|
+
timestamped_dirs = _get_timestamped_directories(data_directory)
|
|
77
|
+
if timestamped_dirs:
|
|
78
|
+
latest = timestamped_dirs[0]
|
|
79
|
+
if (latest / "lean_explore.db").exists():
|
|
80
|
+
return latest
|
|
81
|
+
|
|
82
|
+
return data_directory / active_version
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class Config:
    """Application-wide configuration settings.

    All values are resolved once, when this module is first imported,
    from environment variables and the local filesystem.
    """

    CACHE_DIRECTORY: pathlib.Path = pathlib.Path(
        os.getenv(
            "LEAN_EXPLORE_CACHE_DIR",
            pathlib.Path.home() / ".lean_explore" / "cache",
        )
    )
    """Cache directory for downloaded data (used by search engine and MCP server).

    Can be overridden with LEAN_EXPLORE_CACHE_DIR environment variable.
    Default: ~/.lean_explore/cache
    """

    DATA_DIRECTORY: pathlib.Path = _get_data_directory()
    """Local data directory for extraction pipeline output.

    Can be overridden with LEAN_EXPLORE_DATA_DIR environment variable.
    Default: <repo-root>/data
    """

    DEFAULT_LEAN_VERSION: str = "4.24.0"
    """Lean version for database naming and dependency resolution."""

    ACTIVE_VERSION: str = _get_active_cache_version()
    """Active version identifier for cached data (e.g., "2025.01.27").

    Determined by LEAN_EXPLORE_VERSION env var, ~/.lean_explore/active_version
    file, or defaults to v4.24.0.
    """

    ACTIVE_CACHE_PATH: pathlib.Path = CACHE_DIRECTORY / ACTIVE_VERSION
    """Directory for the active version's cached data files."""

    ACTIVE_DATA_PATH: pathlib.Path = _resolve_active_data_path(
        DATA_DIRECTORY, ACTIVE_VERSION
    )
    """Directory for the active version's local data files.

    Resolved using (in priority order):
    1. DATA_DIRECTORY if it contains lean_explore.db directly
    2. Most recent timestamped extraction directory (YYYYMMDD_HHMMSS)
    3. DATA_DIRECTORY / ACTIVE_VERSION as fallback
    """

    # =========================================================================
    # Timestamped Extraction Directory Methods
    # =========================================================================

    @staticmethod
    def _get_timestamped_directories() -> list[pathlib.Path]:
        """Get all timestamped extraction directories sorted by name descending."""
        # Delegates to the module-level helper of the same name, bound to
        # the configured DATA_DIRECTORY.
        return _get_timestamped_directories(Config.DATA_DIRECTORY)

    @staticmethod
    def get_latest_extraction_path() -> pathlib.Path | None:
        """Get the most recent timestamped extraction directory.

        Looks for directories matching YYYYMMDD_HHMMSS pattern in DATA_DIRECTORY.

        Returns:
            Path to most recent extraction directory, or None if none exist.
        """
        timestamped_directories = Config._get_timestamped_directories()
        return timestamped_directories[0] if timestamped_directories else None

    DATABASE_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "lean_explore.db"
    """Path to SQLite database file in cache (used by search engine)."""

    FAISS_INDEX_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "informalization_faiss.index"
    """Path to FAISS index file in cache (using informalization embeddings)."""

    FAISS_IDS_MAP_PATH: pathlib.Path = (
        ACTIVE_CACHE_PATH / "informalization_faiss_ids_map.json"
    )
    """Path to FAISS ID mapping file in cache."""

    BM25_SPACED_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "bm25_name_spaced"
    """Path to BM25 spaced tokenization index directory in cache."""

    BM25_RAW_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "bm25_name_raw"
    """Path to BM25 raw tokenization index directory in cache."""

    BM25_IDS_MAP_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "bm25_ids_map.json"
    """Path to BM25 ID mapping file in cache."""

    DATABASE_URL: str = f"sqlite+aiosqlite:///{DATABASE_PATH}"
    """Async SQLAlchemy database URL for SQLite (used by search engine)."""

    EXTRACTION_DATABASE_PATH: pathlib.Path = ACTIVE_DATA_PATH / "lean_explore.db"
    """Path to SQLite database file in data directory (used by extraction)."""

    EXTRACTION_DATABASE_URL: str = f"sqlite+aiosqlite:///{EXTRACTION_DATABASE_PATH}"
    """Async SQLAlchemy database URL for extraction pipeline."""

    @staticmethod
    def get_latest_database_path() -> pathlib.Path | None:
        """Get the path to the most recent extraction database.

        Returns:
            Path to lean_explore.db in the most recent extraction, or None.
        """
        latest = Config.get_latest_extraction_path()
        if latest:
            database_path = latest / "lean_explore.db"
            if database_path.exists():
                return database_path
        return None

    @staticmethod
    def create_timestamped_extraction_path() -> pathlib.Path:
        """Create a new timestamped extraction directory.

        Returns:
            Path to the newly created directory (YYYYMMDD_HHMMSS format).
        """
        from datetime import datetime

        # NOTE(review): uses local time, not UTC — confirm that extraction
        # timestamps are intended to be in the machine's local timezone.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        extraction_path = Config.DATA_DIRECTORY / timestamp
        extraction_path.mkdir(parents=True, exist_ok=True)
        return extraction_path

    # =========================================================================
    # Package Workspace Paths
    # =========================================================================

    PACKAGES_ROOT: pathlib.Path = pathlib.Path(
        os.getenv(
            "LEAN_EXPLORE_PACKAGES_ROOT",
            pathlib.Path(__file__).parent.parent.parent / "lean",
        )
    )
    """Root directory for per-package Lean workspaces.

    Can be overridden with LEAN_EXPLORE_PACKAGES_ROOT environment variable.
    Default: <repo-root>/lean
    """

    EXTRACT_PACKAGES: set[str] = {
        "batteries",
        "init",
        "lean4",
        "mathlib",
        "physlean",
        "std",
    }
    """Set of package names to extract from doc-gen4 output."""

    MANIFEST_URL: str = (
        "https://pub-48b75babc4664808b15520033423c765.r2.dev/manifest.json"
    )
    """Remote URL for the data toolchain manifest."""

    R2_ASSETS_BASE_URL: str = "https://pub-48b75babc4664808b15520033423c765.r2.dev"
    """Base URL for Cloudflare R2 asset storage."""

    API_BASE_URL: str = "https://www.leanexplore.com/api/v2"
    """Base URL for the LeanExplore remote API service."""
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
"""Pipeline orchestration for Lean declaration extraction and enrichment.
|
|
2
|
+
|
|
3
|
+
This module provides functions to coordinate the complete data extraction pipeline:
|
|
4
|
+
1. Extract declarations from doc-gen4 output
|
|
5
|
+
2. Generate informal natural language descriptions
|
|
6
|
+
3. Generate vector embeddings for semantic search
|
|
7
|
+
4. Build FAISS indices for vector similarity search
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import click
|
|
16
|
+
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
|
|
17
|
+
|
|
18
|
+
from lean_explore.config import Config
|
|
19
|
+
from lean_explore.models import Base
|
|
20
|
+
from lean_explore.util.logging import setup_logging
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def _create_database_schema(engine: AsyncEngine) -> None:
    """Create all ORM tables that do not yet exist.

    Args:
        engine: SQLAlchemy async engine instance.
    """
    logger.info("Creating database schema...")
    async with engine.begin() as conn:
        # create_all is a no-op for tables that already exist.
        await conn.run_sync(Base.metadata.create_all)
    logger.info("Database schema created successfully")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
async def _run_doc_gen4_step(fresh: bool = False) -> None:
    """Invoke doc-gen4 to produce the documentation that later steps parse.

    Args:
        fresh: Clear cached dependencies to force fresh resolution.
    """
    # Imported lazily so the pipeline module stays cheap to import.
    from lean_explore.extract.doc_gen4 import run_doc_gen4 as _generate_docs

    logger.info("Running doc-gen4...")
    await _generate_docs(fresh=fresh)
    logger.info("doc-gen4 complete")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
async def _run_extract_step(engine: AsyncEngine) -> None:
    """Parse doc-gen4 output and persist declarations to the database."""
    from lean_explore.extract.doc_parser import extract_declarations as _extract

    logger.info("Step 1: Extracting declarations from doc-gen4...")
    await _extract(engine)
    logger.info("Declaration extraction complete")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
async def _run_informalize_step(
    engine: AsyncEngine,
    model: str,
    batch_size: int,
    max_concurrent: int,
    limit: int | None,
) -> None:
    """Produce natural-language descriptions for extracted declarations.

    Args:
        engine: SQLAlchemy async engine instance.
        model: LLM model identifier used for informalization.
        batch_size: Number of rows committed per database transaction.
        max_concurrent: Maximum in-flight LLM requests.
        limit: Optional cap on how many declarations to process.
    """
    from lean_explore.extract.informalize import (
        informalize_declarations as _informalize,
    )

    logger.info("Step 2: Generating informal descriptions...")
    await _informalize(
        engine,
        model=model,
        commit_batch_size=batch_size,
        max_concurrent=max_concurrent,
        limit=limit,
    )
    logger.info("Informalization complete")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def _run_embeddings_step(
    engine: AsyncEngine,
    model_name: str,
    batch_size: int,
    limit: int | None,
    max_seq_length: int,
) -> None:
    """Compute vector embeddings for declaration fields.

    Args:
        engine: SQLAlchemy async engine instance.
        model_name: Sentence-transformer model identifier.
        batch_size: Batch size for embedding generation.
        limit: Optional cap on how many declarations to embed.
        max_seq_length: Maximum input sequence length (lower = less memory).
    """
    from lean_explore.extract.embeddings import generate_embeddings as _embed

    logger.info("Step 3: Generating embeddings...")
    await _embed(
        engine,
        model_name=model_name,
        batch_size=batch_size,
        limit=limit,
        max_seq_length=max_seq_length,
    )
    logger.info("Embedding generation complete")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def _run_index_step(engine: AsyncEngine, extraction_path: Path) -> None:
    """Build the FAISS and BM25 search indices.

    Args:
        engine: SQLAlchemy async engine instance.
        extraction_path: Directory to save indices (same as database location).
    """
    from lean_explore.extract.index import build_bm25_indices, build_faiss_indices

    logger.info("Step 4: Building search indices...")
    # FAISS first, then BM25 — preserves the original pipeline ordering.
    await build_faiss_indices(engine, output_directory=extraction_path)
    await build_bm25_indices(engine, output_directory=extraction_path)
    logger.info("Index building complete")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
async def run_pipeline(
    database_url: str,
    extraction_path: Path,
    run_doc_gen4: bool = False,
    fresh: bool = False,
    parse_docs: bool = True,
    informalize: bool = True,
    embeddings: bool = True,
    index: bool = True,
    informalize_model: str = "google/gemini-3-flash-preview",
    informalize_batch_size: int = 1000,
    informalize_max_concurrent: int = 100,
    informalize_limit: int | None = None,
    embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
    embedding_batch_size: int = 250,
    embedding_limit: int | None = None,
    embedding_max_seq_length: int = 512,
    verbose: bool = False,
) -> None:
    """Run the Lean declaration extraction and enrichment pipeline.

    Steps execute in a fixed order: doc-gen4 (optional), parse-docs,
    informalize, embeddings, index. Each step can be toggled individually.

    Args:
        database_url: SQLite database URL (e.g., sqlite+aiosqlite:///path/to/db)
        extraction_path: Directory containing the extraction (for saving indices).
        run_doc_gen4: Run doc-gen4 to generate documentation before parsing
        fresh: Clear cached dependencies to force fresh resolution (for nightly updates)
        parse_docs: Run doc-gen4 parsing step
        informalize: Run informalization step
        embeddings: Run embeddings generation step
        index: Run FAISS index building step
        informalize_model: LLM model for generating informalizations
        informalize_batch_size: Commit batch size for informalization
        informalize_max_concurrent: Maximum concurrent informalization requests
        informalize_limit: Limit number of declarations to informalize
        embedding_model: Sentence transformer model for embeddings
        embedding_batch_size: Batch size for embedding generation
        embedding_limit: Limit number of declarations for embeddings
        embedding_max_seq_length: Max sequence length for embeddings (lower=less mem)
        verbose: Enable verbose logging

    Raises:
        RuntimeError: If informalization is requested but OPENROUTER_API_KEY
            is not set in the environment.
    """
    setup_logging(verbose)

    # Validate OpenRouter API key if informalization is needed — fail fast
    # before any expensive pipeline work starts.
    if informalize:
        if not os.getenv("OPENROUTER_API_KEY"):
            logger.error(
                "OPENROUTER_API_KEY environment variable is required for "
                "informalization"
            )
            raise RuntimeError("OPENROUTER_API_KEY not set")

    steps_enabled = []
    if parse_docs:
        steps_enabled.append("parse-docs")
    if informalize:
        steps_enabled.append("informalize")
    if embeddings:
        steps_enabled.append("embeddings")
    if index:
        steps_enabled.append("index")

    logger.info("Starting Lean Explore extraction pipeline")
    logger.info(f"Database URL: {database_url}")
    logger.info(f"Steps to run: {', '.join(steps_enabled)}")

    # echo=verbose makes SQLAlchemy log every SQL statement when verbose.
    engine = create_async_engine(database_url, echo=verbose)

    try:
        await _create_database_schema(engine)

        if run_doc_gen4:
            await _run_doc_gen4_step(fresh=fresh)

        if parse_docs:
            await _run_extract_step(engine)

        if informalize:
            await _run_informalize_step(
                engine,
                informalize_model,
                informalize_batch_size,
                informalize_max_concurrent,
                informalize_limit,
            )

        if embeddings:
            await _run_embeddings_step(
                engine,
                embedding_model,
                embedding_batch_size,
                embedding_limit,
                embedding_max_seq_length,
            )

        if index:
            await _run_index_step(engine, extraction_path)

        logger.info("Pipeline completed successfully!")

    finally:
        # Always release DB connections, even when a step raised.
        await engine.dispose()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@click.command()
@click.option(
    "--run-doc-gen4",
    is_flag=True,
    help="Run doc-gen4 to generate documentation before parsing",
)
@click.option(
    "--fresh",
    is_flag=True,
    help="Clear cached dependencies to fetch latest versions (use for nightly updates)",
)
@click.option(
    "--parse-docs/--no-parse-docs",
    default=None,
    help="Run doc-gen4 parsing step (creates new timestamped directory)",
)
@click.option(
    "--informalize/--no-informalize",
    default=None,
    help="Run informalization step (uses latest extraction)",
)
@click.option(
    "--embeddings/--no-embeddings",
    default=None,
    help="Run embeddings generation step (uses latest extraction)",
)
@click.option(
    "--index/--no-index",
    default=None,
    help="Run FAISS index building step (uses latest extraction)",
)
@click.option(
    "--informalize-model",
    default="google/gemini-3-flash-preview",
    help="LLM model for generating informalizations",
)
@click.option(
    "--informalize-batch-size",
    type=int,
    default=1000,
    help="Commit batch size for informalization (default 1000)",
)
@click.option(
    "--informalize-max-concurrent",
    type=int,
    default=100,
    help="Maximum concurrent informalization requests",
)
@click.option(
    "--informalize-limit",
    type=int,
    default=None,
    help="Limit number of declarations to informalize (for testing)",
)
@click.option(
    "--embedding-model",
    default="Qwen/Qwen3-Embedding-0.6B",
    help="Sentence transformer model for embeddings",
)
@click.option(
    "--embedding-batch-size",
    type=int,
    default=250,
    help="Batch size for embedding generation (lower = less memory, default 250)",
)
@click.option(
    "--embedding-limit",
    type=int,
    default=None,
    help="Limit number of declarations for embeddings (for testing)",
)
@click.option(
    "--embedding-max-seq-length",
    type=int,
    default=512,
    help="Max sequence length for embeddings (lower = less memory, default 512)",
)
@click.option("--verbose", is_flag=True, help="Enable verbose logging")
def main(
    run_doc_gen4: bool,
    fresh: bool,
    parse_docs: bool | None,
    informalize: bool | None,
    embeddings: bool | None,
    index: bool | None,
    informalize_model: str,
    informalize_batch_size: int,
    informalize_max_concurrent: int,
    informalize_limit: int | None,
    embedding_model: str,
    embedding_batch_size: int,
    embedding_limit: int | None,
    embedding_max_seq_length: int,
    verbose: bool,
) -> None:
    """Run the Lean declaration extraction and enrichment pipeline.

    Extraction creates timestamped directories (YYYYMMDD_HHMMSS format).
    Subsequent steps (informalize, embeddings, index) use the latest extraction.
    """
    # Determine if any flags were explicitly set (including --run-doc-gen4).
    step_flags = [run_doc_gen4, parse_docs, informalize, embeddings, index]
    any_flag_explicitly_set = run_doc_gen4 or any(
        flag is not None for flag in step_flags[1:]
    )

    # If no flags were explicitly set, run all pipeline steps by default.
    # Otherwise, only run what was explicitly requested.
    if not any_flag_explicitly_set:
        parse_docs = informalize = embeddings = index = True
    else:
        parse_docs = parse_docs if parse_docs is not None else False
        informalize = informalize if informalize is not None else False
        embeddings = embeddings if embeddings is not None else False
        index = index if index is not None else False

    # Determine extraction directory.
    # NOTE(review): these logger.info calls run before setup_logging() is
    # invoked (inside run_pipeline), so they may be suppressed by the
    # default logging configuration — confirm whether that is intentional.
    if parse_docs:
        # Create new timestamped directory for fresh extraction.
        extraction_path = Config.create_timestamped_extraction_path()
        logger.info(f"Created new extraction directory: {extraction_path}")
    else:
        # Use latest existing extraction for subsequent steps.
        extraction_path = Config.get_latest_extraction_path()
        if extraction_path is None:
            raise click.ClickException(
                "No existing extraction found. Run with --parse-docs first."
            )
        logger.info(f"Using existing extraction: {extraction_path}")

    database_path = extraction_path / "lean_explore.db"
    database_url = f"sqlite+aiosqlite:///{database_path}"

    asyncio.run(
        run_pipeline(
            database_url=database_url,
            extraction_path=extraction_path,
            run_doc_gen4=run_doc_gen4,
            fresh=fresh,
            parse_docs=parse_docs,
            informalize=informalize,
            embeddings=embeddings,
            index=index,
            informalize_model=informalize_model,
            # Previously this value could not be set from the CLI even
            # though run_pipeline accepts it; --informalize-batch-size
            # now forwards it (default matches run_pipeline's default).
            informalize_batch_size=informalize_batch_size,
            informalize_max_concurrent=informalize_max_concurrent,
            informalize_limit=informalize_limit,
            embedding_model=embedding_model,
            embedding_batch_size=embedding_batch_size,
            embedding_limit=embedding_limit,
            embedding_max_seq_length=embedding_max_seq_length,
            verbose=verbose,
        )
    )


if __name__ == "__main__":
    main()
|