lean-explore 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. lean_explore/__init__.py +14 -1
  2. lean_explore/api/__init__.py +12 -1
  3. lean_explore/api/client.py +60 -80
  4. lean_explore/cli/__init__.py +10 -1
  5. lean_explore/cli/data_commands.py +157 -479
  6. lean_explore/cli/display.py +171 -0
  7. lean_explore/cli/main.py +51 -608
  8. lean_explore/config.py +244 -0
  9. lean_explore/extract/__init__.py +5 -0
  10. lean_explore/extract/__main__.py +368 -0
  11. lean_explore/extract/doc_gen4.py +200 -0
  12. lean_explore/extract/doc_parser.py +499 -0
  13. lean_explore/extract/embeddings.py +371 -0
  14. lean_explore/extract/github.py +110 -0
  15. lean_explore/extract/index.py +317 -0
  16. lean_explore/extract/informalize.py +653 -0
  17. lean_explore/extract/package_config.py +59 -0
  18. lean_explore/extract/package_registry.py +45 -0
  19. lean_explore/extract/package_utils.py +105 -0
  20. lean_explore/extract/types.py +25 -0
  21. lean_explore/mcp/__init__.py +11 -1
  22. lean_explore/mcp/app.py +14 -46
  23. lean_explore/mcp/server.py +20 -35
  24. lean_explore/mcp/tools.py +70 -177
  25. lean_explore/models/__init__.py +9 -0
  26. lean_explore/models/search_db.py +76 -0
  27. lean_explore/models/search_types.py +53 -0
  28. lean_explore/search/__init__.py +32 -0
  29. lean_explore/search/engine.py +655 -0
  30. lean_explore/search/scoring.py +156 -0
  31. lean_explore/search/service.py +68 -0
  32. lean_explore/search/tokenization.py +71 -0
  33. lean_explore/util/__init__.py +28 -0
  34. lean_explore/util/embedding_client.py +92 -0
  35. lean_explore/util/logging.py +22 -0
  36. lean_explore/util/openrouter_client.py +63 -0
  37. lean_explore/util/reranker_client.py +189 -0
  38. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/METADATA +55 -10
  39. lean_explore-1.0.0.dist-info/RECORD +43 -0
  40. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/WHEEL +1 -1
  41. lean_explore-1.0.0.dist-info/entry_points.txt +2 -0
  42. lean_explore/cli/agent.py +0 -781
  43. lean_explore/cli/config_utils.py +0 -481
  44. lean_explore/defaults.py +0 -114
  45. lean_explore/local/__init__.py +0 -1
  46. lean_explore/local/search.py +0 -1050
  47. lean_explore/local/service.py +0 -392
  48. lean_explore/shared/__init__.py +0 -1
  49. lean_explore/shared/models/__init__.py +0 -1
  50. lean_explore/shared/models/api.py +0 -117
  51. lean_explore/shared/models/db.py +0 -396
  52. lean_explore-0.2.2.dist-info/RECORD +0 -26
  53. lean_explore-0.2.2.dist-info/entry_points.txt +0 -2
  54. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/licenses/LICENSE +0 -0
  55. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/top_level.txt +0 -0
lean_explore/config.py ADDED
@@ -0,0 +1,244 @@
1
+ # src/lean_explore/config.py
2
+
3
+ """Centralized configuration for lean_explore.
4
+
5
+ This module provides all configuration settings including paths, URLs,
6
+ and other constants used throughout the application.
7
+ """
8
+
9
+ import os
10
+ import pathlib
11
+
12
+
13
+ def _get_active_cache_version() -> str:
14
+ """Get the active cache version from the version file or environment.
15
+
16
+ The version is determined by (in order of priority):
17
+ 1. LEAN_EXPLORE_VERSION environment variable
18
+ 2. Contents of ~/.lean_explore/active_version file (set by data fetch)
19
+ 3. Default fallback version
20
+
21
+ Returns:
22
+ The active version string (e.g., "2025.01.27" or "v4.24.0").
23
+ """
24
+ env_version = os.getenv("LEAN_EXPLORE_VERSION")
25
+ if env_version:
26
+ return env_version
27
+
28
+ version_file = pathlib.Path.home() / ".lean_explore" / "active_version"
29
+ if version_file.exists():
30
+ return version_file.read_text().strip()
31
+
32
+ return "v4.24.0"
33
+
34
+
35
+ def _get_data_directory() -> pathlib.Path:
36
+ """Get the data directory path."""
37
+ return pathlib.Path(
38
+ os.getenv(
39
+ "LEAN_EXPLORE_DATA_DIR",
40
+ pathlib.Path(__file__).parent.parent.parent / "data",
41
+ )
42
+ )
43
+
44
+
45
+ def _get_timestamped_directories(data_directory: pathlib.Path) -> list[pathlib.Path]:
46
+ """Get all timestamped extraction directories sorted by name descending."""
47
+ import re
48
+
49
+ if not data_directory.exists():
50
+ return []
51
+
52
+ timestamp_pattern = re.compile(r"^\d{8}_\d{6}$")
53
+ timestamped_directories = [
54
+ directory
55
+ for directory in data_directory.iterdir()
56
+ if directory.is_dir() and timestamp_pattern.match(directory.name)
57
+ ]
58
+
59
+ timestamped_directories.sort(key=lambda d: d.name, reverse=True)
60
+ return timestamped_directories
61
+
62
+
63
+ def _resolve_active_data_path(
64
+ data_directory: pathlib.Path, active_version: str
65
+ ) -> pathlib.Path:
66
+ """Resolve the active data path using the best available source.
67
+
68
+ Priority:
69
+ 1. DATA_DIRECTORY if it contains lean_explore.db directly
70
+ 2. Most recent timestamped extraction directory (YYYYMMDD_HHMMSS)
71
+ 3. DATA_DIRECTORY / ACTIVE_VERSION as fallback
72
+ """
73
+ if (data_directory / "lean_explore.db").exists():
74
+ return data_directory
75
+
76
+ timestamped_dirs = _get_timestamped_directories(data_directory)
77
+ if timestamped_dirs:
78
+ latest = timestamped_dirs[0]
79
+ if (latest / "lean_explore.db").exists():
80
+ return latest
81
+
82
+ return data_directory / active_version
83
+
84
+
85
class Config:
    """Application-wide configuration settings (paths, URLs, constants)."""

    # --- Base directories --------------------------------------------------

    # Cache directory for downloaded data (search engine and MCP server).
    # Override with LEAN_EXPLORE_CACHE_DIR; default ~/.lean_explore/cache.
    CACHE_DIRECTORY: pathlib.Path = pathlib.Path(
        os.getenv(
            "LEAN_EXPLORE_CACHE_DIR",
            pathlib.Path.home() / ".lean_explore" / "cache",
        )
    )

    # Local data directory for extraction pipeline output.
    # Override with LEAN_EXPLORE_DATA_DIR; default <repo-root>/data.
    DATA_DIRECTORY: pathlib.Path = _get_data_directory()

    # Lean version for database naming and dependency resolution.
    DEFAULT_LEAN_VERSION: str = "4.24.0"

    # Active version identifier for cached data (e.g., "2025.01.27").
    # Resolved from LEAN_EXPLORE_VERSION, the ~/.lean_explore/active_version
    # file, or a built-in default.
    ACTIVE_VERSION: str = _get_active_cache_version()

    # Directory for the active version's cached data files.
    ACTIVE_CACHE_PATH: pathlib.Path = CACHE_DIRECTORY / ACTIVE_VERSION

    # Directory for the active version's local data files. Resolved from
    # DATA_DIRECTORY itself, the newest timestamped extraction, or
    # DATA_DIRECTORY / ACTIVE_VERSION as a fallback.
    ACTIVE_DATA_PATH: pathlib.Path = _resolve_active_data_path(
        DATA_DIRECTORY, ACTIVE_VERSION
    )

    # --- Search artifact paths (in cache) ----------------------------------

    # SQLite database used by the search engine.
    DATABASE_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "lean_explore.db"

    # FAISS index built from informalization embeddings, plus its ID map.
    FAISS_INDEX_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "informalization_faiss.index"
    FAISS_IDS_MAP_PATH: pathlib.Path = (
        ACTIVE_CACHE_PATH / "informalization_faiss_ids_map.json"
    )

    # BM25 index directories (spaced/raw tokenization) and their ID map.
    BM25_SPACED_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "bm25_name_spaced"
    BM25_RAW_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "bm25_name_raw"
    BM25_IDS_MAP_PATH: pathlib.Path = ACTIVE_CACHE_PATH / "bm25_ids_map.json"

    # Async SQLAlchemy URL for the search-engine database.
    DATABASE_URL: str = f"sqlite+aiosqlite:///{DATABASE_PATH}"

    # --- Extraction pipeline paths -----------------------------------------

    # SQLite database in the data directory (extraction pipeline) + its URL.
    EXTRACTION_DATABASE_PATH: pathlib.Path = ACTIVE_DATA_PATH / "lean_explore.db"
    EXTRACTION_DATABASE_URL: str = f"sqlite+aiosqlite:///{EXTRACTION_DATABASE_PATH}"

    # Root directory for per-package Lean workspaces.
    # Override with LEAN_EXPLORE_PACKAGES_ROOT; default <repo-root>/lean.
    PACKAGES_ROOT: pathlib.Path = pathlib.Path(
        os.getenv(
            "LEAN_EXPLORE_PACKAGES_ROOT",
            pathlib.Path(__file__).parent.parent.parent / "lean",
        )
    )

    # Package names to extract from doc-gen4 output.
    EXTRACT_PACKAGES: set[str] = {
        "batteries",
        "init",
        "lean4",
        "mathlib",
        "physlean",
        "std",
    }

    # --- Remote endpoints ---------------------------------------------------

    # Data toolchain manifest and Cloudflare R2 asset storage.
    MANIFEST_URL: str = (
        "https://pub-48b75babc4664808b15520033423c765.r2.dev/manifest.json"
    )
    R2_ASSETS_BASE_URL: str = "https://pub-48b75babc4664808b15520033423c765.r2.dev"

    # LeanExplore remote API service.
    API_BASE_URL: str = "https://www.leanexplore.com/api/v2"

    # --- Timestamped extraction helpers ------------------------------------

    @staticmethod
    def _get_timestamped_directories() -> list[pathlib.Path]:
        """Return timestamped extraction directories, newest first."""
        return _get_timestamped_directories(Config.DATA_DIRECTORY)

    @staticmethod
    def get_latest_extraction_path() -> pathlib.Path | None:
        """Return the most recent YYYYMMDD_HHMMSS extraction directory.

        Returns:
            Path to the most recent extraction directory, or None if none exist.
        """
        directories = Config._get_timestamped_directories()
        return directories[0] if directories else None

    @staticmethod
    def get_latest_database_path() -> pathlib.Path | None:
        """Return lean_explore.db inside the most recent extraction, or None."""
        latest = Config.get_latest_extraction_path()
        if latest is not None:
            candidate = latest / "lean_explore.db"
            if candidate.exists():
                return candidate
        return None

    @staticmethod
    def create_timestamped_extraction_path() -> pathlib.Path:
        """Create and return a new YYYYMMDD_HHMMSS extraction directory."""
        from datetime import datetime

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        extraction_path = Config.DATA_DIRECTORY / timestamp
        extraction_path.mkdir(parents=True, exist_ok=True)
        return extraction_path
@@ -0,0 +1,5 @@
1
+ """Lean declaration extraction and processing tools.
2
+
3
+ This package contains modules for extracting, parsing, and enriching
4
+ Lean mathematical declarations from documentation files.
5
+ """
@@ -0,0 +1,368 @@
1
+ """Pipeline orchestration for Lean declaration extraction and enrichment.
2
+
3
+ This module provides functions to coordinate the complete data extraction pipeline:
4
+ 1. Extract declarations from doc-gen4 output
5
+ 2. Generate informal natural language descriptions
6
+ 3. Generate vector embeddings for semantic search
7
+ 4. Build FAISS indices for vector similarity search
8
+ """
9
+
10
+ import asyncio
11
+ import logging
12
+ import os
13
+ from pathlib import Path
14
+
15
+ import click
16
+ from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
17
+
18
+ from lean_explore.config import Config
19
+ from lean_explore.models import Base
20
+ from lean_explore.util.logging import setup_logging
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
async def _create_database_schema(engine: AsyncEngine) -> None:
    """Create all ORM-declared tables if they don't already exist.

    Args:
        engine: SQLAlchemy async engine instance.
    """
    logger.info("Creating database schema...")
    # create_all is a sync API, so bridge it through run_sync on the
    # transactional connection.
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    logger.info("Database schema created successfully")
35
+
36
+
37
async def _run_doc_gen4_step(fresh: bool = False) -> None:
    """Invoke doc-gen4 to build documentation output.

    Args:
        fresh: When True, clear cached dependencies so they are re-resolved.
    """
    # Lazy import keeps startup light when this step is skipped.
    from lean_explore.extract.doc_gen4 import run_doc_gen4

    logger.info("Running doc-gen4...")
    await run_doc_gen4(fresh=fresh)
    logger.info("doc-gen4 complete")
48
+
49
+
50
async def _run_extract_step(engine: AsyncEngine) -> None:
    """Parse doc-gen4 output and persist declarations to the database."""
    # Lazy import keeps startup light when this step is skipped.
    from lean_explore.extract.doc_parser import extract_declarations

    logger.info("Step 1: Extracting declarations from doc-gen4...")
    await extract_declarations(engine)
    logger.info("Declaration extraction complete")
57
+
58
+
59
async def _run_informalize_step(
    engine: AsyncEngine,
    model: str,
    batch_size: int,
    max_concurrent: int,
    limit: int | None,
) -> None:
    """Produce informal natural-language descriptions for declarations.

    Args:
        engine: SQLAlchemy async engine instance.
        model: LLM model identifier used for generation.
        batch_size: Number of rows committed per database transaction.
        max_concurrent: Upper bound on concurrent requests.
        limit: Optional cap on how many declarations to process.
    """
    # Lazy import keeps startup light when this step is skipped.
    from lean_explore.extract.informalize import informalize_declarations

    logger.info("Step 2: Generating informal descriptions...")
    await informalize_declarations(
        engine,
        model=model,
        commit_batch_size=batch_size,
        max_concurrent=max_concurrent,
        limit=limit,
    )
    logger.info("Informalization complete")
78
+
79
+
80
async def _run_embeddings_step(
    engine: AsyncEngine,
    model_name: str,
    batch_size: int,
    limit: int | None,
    max_seq_length: int,
) -> None:
    """Compute vector embeddings for all declaration fields.

    Args:
        engine: SQLAlchemy async engine instance.
        model_name: Sentence-transformer model used for embeddings.
        batch_size: Batch size for generation (lower = less memory).
        limit: Optional cap on how many declarations to process.
        max_seq_length: Maximum input sequence length.
    """
    # Lazy import keeps startup light when this step is skipped.
    from lean_explore.extract.embeddings import generate_embeddings

    logger.info("Step 3: Generating embeddings...")
    await generate_embeddings(
        engine,
        model_name=model_name,
        batch_size=batch_size,
        limit=limit,
        max_seq_length=max_seq_length,
    )
    logger.info("Embedding generation complete")
99
+
100
+
101
async def _run_index_step(engine: AsyncEngine, extraction_path: Path) -> None:
    """Build the FAISS and BM25 search indices.

    Args:
        engine: SQLAlchemy async engine instance.
        extraction_path: Directory to save indices (same as database location).
    """
    # Lazy import keeps startup light when this step is skipped.
    from lean_explore.extract.index import build_bm25_indices, build_faiss_indices

    logger.info("Step 4: Building search indices...")
    await build_faiss_indices(engine, output_directory=extraction_path)
    await build_bm25_indices(engine, output_directory=extraction_path)
    logger.info("Index building complete")
114
+
115
+
116
async def run_pipeline(
    database_url: str,
    extraction_path: Path,
    run_doc_gen4: bool = False,
    fresh: bool = False,
    parse_docs: bool = True,
    informalize: bool = True,
    embeddings: bool = True,
    index: bool = True,
    informalize_model: str = "google/gemini-3-flash-preview",
    informalize_batch_size: int = 1000,
    informalize_max_concurrent: int = 100,
    informalize_limit: int | None = None,
    embedding_model: str = "Qwen/Qwen3-Embedding-0.6B",
    embedding_batch_size: int = 250,
    embedding_limit: int | None = None,
    embedding_max_seq_length: int = 512,
    verbose: bool = False,
) -> None:
    """Run the Lean declaration extraction and enrichment pipeline.

    Args:
        database_url: SQLite database URL (e.g., sqlite+aiosqlite:///path/to/db)
        extraction_path: Directory containing the extraction (for saving indices).
        run_doc_gen4: Run doc-gen4 to generate documentation before parsing
        fresh: Clear cached dependencies to force fresh resolution (for nightly updates)
        parse_docs: Run doc-gen4 parsing step
        informalize: Run informalization step
        embeddings: Run embeddings generation step
        index: Run FAISS index building step
        informalize_model: LLM model for generating informalizations
        informalize_batch_size: Commit batch size for informalization
        informalize_max_concurrent: Maximum concurrent informalization requests
        informalize_limit: Limit number of declarations to informalize
        embedding_model: Sentence transformer model for embeddings
        embedding_batch_size: Batch size for embedding generation
        embedding_limit: Limit number of declarations for embeddings
        embedding_max_seq_length: Max sequence length for embeddings (lower=less mem)
        verbose: Enable verbose logging
    """
    setup_logging(verbose)

    # Informalization requires OpenRouter credentials; fail fast without them.
    if informalize and not os.getenv("OPENROUTER_API_KEY"):
        logger.error(
            "OPENROUTER_API_KEY environment variable is required for "
            "informalization"
        )
        raise RuntimeError("OPENROUTER_API_KEY not set")

    step_toggles = [
        ("parse-docs", parse_docs),
        ("informalize", informalize),
        ("embeddings", embeddings),
        ("index", index),
    ]
    enabled_steps = [name for name, enabled in step_toggles if enabled]

    logger.info("Starting Lean Explore extraction pipeline")
    logger.info(f"Database URL: {database_url}")
    logger.info(f"Steps to run: {', '.join(enabled_steps)}")

    engine = create_async_engine(database_url, echo=verbose)

    try:
        await _create_database_schema(engine)

        if run_doc_gen4:
            await _run_doc_gen4_step(fresh=fresh)

        if parse_docs:
            await _run_extract_step(engine)

        if informalize:
            await _run_informalize_step(
                engine,
                informalize_model,
                informalize_batch_size,
                informalize_max_concurrent,
                informalize_limit,
            )

        if embeddings:
            await _run_embeddings_step(
                engine,
                embedding_model,
                embedding_batch_size,
                embedding_limit,
                embedding_max_seq_length,
            )

        if index:
            await _run_index_step(engine, extraction_path)

        logger.info("Pipeline completed successfully!")

    finally:
        # Always release pooled connections, even on failure.
        await engine.dispose()
217
+
218
+
219
@click.command()
@click.option(
    "--run-doc-gen4",
    is_flag=True,
    help="Run doc-gen4 to generate documentation before parsing",
)
@click.option(
    "--fresh",
    is_flag=True,
    help="Clear cached dependencies to fetch latest versions (use for nightly updates)",
)
@click.option(
    "--parse-docs/--no-parse-docs",
    default=None,
    help="Run doc-gen4 parsing step (creates new timestamped directory)",
)
@click.option(
    "--informalize/--no-informalize",
    default=None,
    help="Run informalization step (uses latest extraction)",
)
@click.option(
    "--embeddings/--no-embeddings",
    default=None,
    help="Run embeddings generation step (uses latest extraction)",
)
@click.option(
    "--index/--no-index",
    default=None,
    help="Run FAISS index building step (uses latest extraction)",
)
@click.option(
    "--informalize-model",
    default="google/gemini-3-flash-preview",
    help="LLM model for generating informalizations",
)
@click.option(
    "--informalize-batch-size",
    type=int,
    default=1000,
    help="Commit batch size for informalization (default 1000)",
)
@click.option(
    "--informalize-max-concurrent",
    type=int,
    default=100,
    help="Maximum concurrent informalization requests",
)
@click.option(
    "--informalize-limit",
    type=int,
    default=None,
    help="Limit number of declarations to informalize (for testing)",
)
@click.option(
    "--embedding-model",
    default="Qwen/Qwen3-Embedding-0.6B",
    help="Sentence transformer model for embeddings",
)
@click.option(
    "--embedding-batch-size",
    type=int,
    default=250,
    help="Batch size for embedding generation (lower = less memory, default 250)",
)
@click.option(
    "--embedding-limit",
    type=int,
    default=None,
    help="Limit number of declarations for embeddings (for testing)",
)
@click.option(
    "--embedding-max-seq-length",
    type=int,
    default=512,
    help="Max sequence length for embeddings (lower = less memory, default 512)",
)
@click.option("--verbose", is_flag=True, help="Enable verbose logging")
def main(
    run_doc_gen4: bool,
    fresh: bool,
    parse_docs: bool | None,
    informalize: bool | None,
    embeddings: bool | None,
    index: bool | None,
    informalize_model: str,
    informalize_batch_size: int,
    informalize_max_concurrent: int,
    informalize_limit: int | None,
    embedding_model: str,
    embedding_batch_size: int,
    embedding_limit: int | None,
    embedding_max_seq_length: int,
    verbose: bool,
) -> None:
    """Run the Lean declaration extraction and enrichment pipeline.

    Extraction creates timestamped directories (YYYYMMDD_HHMMSS format).
    Subsequent steps (informalize, embeddings, index) use the latest extraction.
    """
    # A None step flag means "not specified on the command line". If nothing
    # was explicitly requested (including --run-doc-gen4), run the full
    # pipeline by default; otherwise run only what was asked for.
    explicit_step_flags = (parse_docs, informalize, embeddings, index)
    any_flag_explicitly_set = run_doc_gen4 or any(
        flag is not None for flag in explicit_step_flags
    )

    if not any_flag_explicitly_set:
        parse_docs = informalize = embeddings = index = True
    else:
        parse_docs = bool(parse_docs)
        informalize = bool(informalize)
        embeddings = bool(embeddings)
        index = bool(index)

    # Parsing starts a fresh timestamped extraction; every other step reuses
    # the most recent one.
    if parse_docs:
        extraction_path = Config.create_timestamped_extraction_path()
        logger.info(f"Created new extraction directory: {extraction_path}")
    else:
        extraction_path = Config.get_latest_extraction_path()
        if extraction_path is None:
            raise click.ClickException(
                "No existing extraction found. Run with --parse-docs first."
            )
        logger.info(f"Using existing extraction: {extraction_path}")

    database_path = extraction_path / "lean_explore.db"
    database_url = f"sqlite+aiosqlite:///{database_path}"

    asyncio.run(
        run_pipeline(
            database_url=database_url,
            extraction_path=extraction_path,
            run_doc_gen4=run_doc_gen4,
            fresh=fresh,
            parse_docs=parse_docs,
            informalize=informalize,
            embeddings=embeddings,
            index=index,
            informalize_model=informalize_model,
            informalize_batch_size=informalize_batch_size,
            informalize_max_concurrent=informalize_max_concurrent,
            informalize_limit=informalize_limit,
            embedding_model=embedding_model,
            embedding_batch_size=embedding_batch_size,
            embedding_limit=embedding_limit,
            embedding_max_seq_length=embedding_max_seq_length,
            verbose=verbose,
        )
    )
365
+
366
+
367
+ if __name__ == "__main__":
368
+ main()