lean-explore 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ # src/lean_explore/local/service.py
2
+
3
+ """Provides a service class for local Lean data exploration.
4
+
5
+ This module defines the Service class, which offers methods to search,
6
+ retrieve by ID, and get dependencies for statement groups using local
7
+ data assets (SQLite database, FAISS index, and embedding models).
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ from typing import List, Optional
13
+
14
+ import faiss # For type hinting if needed
15
+ from sentence_transformers import SentenceTransformer # For type hinting if needed
16
+ from sqlalchemy import create_engine
17
+ from sqlalchemy.exc import OperationalError, SQLAlchemyError
18
+ from sqlalchemy.orm import Session as SQLAlchemySessionType
19
+ from sqlalchemy.orm import joinedload, sessionmaker
20
+
21
+ from lean_explore import defaults
22
+ from lean_explore.shared.models.api import (
23
+ APICitationsResponse,
24
+ APIPrimaryDeclarationInfo,
25
+ APISearchResponse,
26
+ APISearchResultItem,
27
+ )
28
+ from lean_explore.shared.models.db import (
29
+ StatementGroup,
30
+ StatementGroupDependency,
31
+ )
32
+
33
+ from .search import load_embedding_model, load_faiss_assets, perform_search
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
class Service:
    """A service for interacting with local Lean explore data.

    This service loads necessary data assets (embedding model, FAISS index,
    database connection) upon initialization using default paths and parameters
    derived from the active toolchain. It provides methods for searching
    statement groups, retrieving them by ID, and fetching dependencies
    (citations).

    Attributes:
        embedding_model: The loaded sentence embedding model.
        faiss_index: The loaded FAISS index.
        text_chunk_id_map: A list mapping FAISS indices to text chunk IDs.
        engine: The SQLAlchemy engine for database connections.
        SessionLocal: The SQLAlchemy sessionmaker for creating sessions.
        default_faiss_k (int): Default number of FAISS neighbors to retrieve.
        default_pagerank_weight (float): Default weight for PageRank.
        default_text_relevance_weight (float): Default weight for text relevance.
        default_name_match_weight (float): Default weight for name matching.
        default_semantic_similarity_threshold (float): Default similarity threshold.
        default_results_limit (int): Default limit for search results.
        default_faiss_nprobe (int): Default nprobe for FAISS IVF indexes.
    """

    def __init__(self):
        """Initializes the Service by loading data assets and configurations.

        Checks for essential local data files first, then loads the
        embedding model, FAISS index, and sets up the database engine.
        Paths for data assets are sourced from `lean_explore.defaults`.

        Raises:
            FileNotFoundError: If essential data files (DB, FAISS index, map)
                are not found at their expected locations.
            RuntimeError: If the embedding model fails to load or if other
                critical initialization steps (like database connection
                after file checks) fail.
        """
        logger.info("Initializing local Service...")
        try:
            defaults.LEAN_EXPLORE_TOOLCHAINS_BASE_DIR.mkdir(parents=True, exist_ok=True)
            logger.info(
                "User toolchains base directory ensured: "
                f"{defaults.LEAN_EXPLORE_TOOLCHAINS_BASE_DIR}"
            )
        except OSError as e:
            # Best-effort: failure to create the base directory is logged but
            # not fatal here; the explicit file-existence checks below produce
            # actionable errors if assets are truly missing.
            logger.error(
                f"Could not create user toolchains base directory "
                f"{defaults.LEAN_EXPLORE_TOOLCHAINS_BASE_DIR}: {e}"
            )

        db_path = defaults.DEFAULT_DB_PATH
        db_url = defaults.DEFAULT_DB_URL
        # Only file-backed SQLite URLs can be pre-checked for existence.
        is_file_db = db_url.startswith("sqlite:///")

        if is_file_db and not db_path.exists():
            error_message = (
                f"Database file not found at the expected location: {db_path}\n"
                "Please run 'leanexplore data fetch' to download the data toolchain."
            )
            logger.error(error_message)
            raise FileNotFoundError(error_message)

        logger.info(f"Loading embedding model: {defaults.DEFAULT_EMBEDDING_MODEL_NAME}")
        self.embedding_model: Optional[SentenceTransformer] = load_embedding_model(
            defaults.DEFAULT_EMBEDDING_MODEL_NAME
        )
        if self.embedding_model is None:
            raise RuntimeError(
                f"Failed to load embedding model: "
                f"{defaults.DEFAULT_EMBEDDING_MODEL_NAME}. "
                "Check model name and network connection if downloaded on the fly."
            )

        faiss_index_path = defaults.DEFAULT_FAISS_INDEX_PATH
        faiss_map_path = defaults.DEFAULT_FAISS_MAP_PATH
        logger.info(
            f"Attempting to load FAISS assets: Index='{faiss_index_path}', "
            f"Map='{faiss_map_path}'"
        )

        faiss_assets = load_faiss_assets(str(faiss_index_path), str(faiss_map_path))
        if faiss_assets[0] is None or faiss_assets[1] is None:
            error_message = (
                "Failed to load critical FAISS assets (index or ID map).\n"
                "Expected at:\n"
                f"  Index path: {faiss_index_path}\n"
                f"  ID map path: {faiss_map_path}\n"
                "Please run 'leanexplore data fetch' to download or update the data "
                "toolchain."
            )
            logger.error(error_message)
            raise FileNotFoundError(error_message)
        self.faiss_index: faiss.Index = faiss_assets[0]
        self.text_chunk_id_map: List[str] = faiss_assets[1]
        logger.info("FAISS assets loaded successfully.")

        logger.info(f"Initializing database engine. Expected DB path: {db_path}")
        try:
            self.engine = create_engine(db_url)
            # Test connection eagerly so misconfiguration surfaces at startup
            # rather than on the first query.
            with self.engine.connect():
                logger.info("Database connection successful.")
                # Setup SessionLocal after successful connection test
                self.SessionLocal: sessionmaker[SQLAlchemySessionType] = sessionmaker(
                    autocommit=False, autoflush=False, bind=self.engine
                )
        except OperationalError as oe:
            guidance = (
                "Please check your database configuration or connection parameters."
            )
            if is_file_db:
                guidance = (
                    f"The database file at '{db_path}' might be corrupted, "
                    "inaccessible, or not a valid SQLite file. "
                    "Consider running 'leanexplore data fetch' to get a fresh copy."
                )
            logger.error(
                f"Failed to initialize database engine or connection to {db_url}: "
                f"{oe}\n{guidance}"
            )
            raise RuntimeError(
                f"Database initialization failed: {oe}. {guidance}"
            ) from oe
        except Exception as e:
            logger.error(
                f"Unexpected error during database engine initialization: {e}",
                exc_info=True,
            )
            raise RuntimeError(
                f"Database initialization failed unexpectedly: {e}"
            ) from e

        # Search-tuning parameters, sourced from the package defaults.
        self.default_faiss_k: int = defaults.DEFAULT_FAISS_K
        self.default_pagerank_weight: float = defaults.DEFAULT_PAGERANK_WEIGHT
        self.default_text_relevance_weight: float = (
            defaults.DEFAULT_TEXT_RELEVANCE_WEIGHT
        )
        self.default_name_match_weight: float = defaults.DEFAULT_NAME_MATCH_WEIGHT
        self.default_semantic_similarity_threshold: float = (
            defaults.DEFAULT_SEM_SIM_THRESHOLD
        )
        self.default_results_limit: int = defaults.DEFAULT_RESULTS_LIMIT
        self.default_faiss_nprobe: int = defaults.DEFAULT_FAISS_NPROBE

        logger.info("Local Service initialized successfully.")

    def _serialize_sg_to_api_item(self, sg_orm: StatementGroup) -> APISearchResultItem:
        """Converts a StatementGroup ORM obj to APISearchResultItem Pydantic model.

        Args:
            sg_orm: The SQLAlchemy StatementGroup object.

        Returns:
            An APISearchResultItem Pydantic model instance.
        """
        primary_decl_info = APIPrimaryDeclarationInfo(
            lean_name=sg_orm.primary_declaration.lean_name
            if sg_orm.primary_declaration
            else None
        )
        return APISearchResultItem(
            id=sg_orm.id,
            primary_declaration=primary_decl_info,
            source_file=sg_orm.source_file,
            range_start_line=sg_orm.range_start_line,
            display_statement_text=sg_orm.display_statement_text,
            statement_text=sg_orm.statement_text,
            docstring=sg_orm.docstring,
            informal_description=sg_orm.informal_description,
        )

    def search(
        self,
        query: str,
        package_filters: Optional[List[str]] = None,
        limit: Optional[int] = None,
    ) -> APISearchResponse:
        """Performs a local search for statement groups.

        Args:
            query: The search query string.
            package_filters: An optional list of package names to filter results by.
            limit: An optional limit on the number of results to return.
                If None, defaults.DEFAULT_RESULTS_LIMIT is used.

        Returns:
            An APISearchResponse object containing search results and metadata.

        Raises:
            RuntimeError: If service not properly initialized (e.g., assets missing).
            Exception: Propagates exceptions from `perform_search`.
        """
        # perf_counter is monotonic; time.time() can jump if the system clock
        # is adjusted mid-request, yielding bogus processing times.
        start_time = time.perf_counter()
        actual_limit = limit if limit is not None else self.default_results_limit

        if (
            self.embedding_model is None
            or self.faiss_index is None
            or self.text_chunk_id_map is None
        ):
            logger.error(
                "Search service assets not loaded. Service may not have initialized "
                "correctly."
            )
            raise RuntimeError(
                "Search service assets not loaded. Please ensure data has been fetched."
            )

        with self.SessionLocal() as session:
            try:
                ranked_results_orm = perform_search(
                    session=session,
                    query_string=query,
                    model=self.embedding_model,
                    faiss_index=self.faiss_index,
                    text_chunk_id_map=self.text_chunk_id_map,
                    faiss_k=self.default_faiss_k,
                    pagerank_weight=self.default_pagerank_weight,
                    text_relevance_weight=self.default_text_relevance_weight,
                    name_match_weight=self.default_name_match_weight,
                    selected_packages=package_filters,
                    semantic_similarity_threshold=(
                        self.default_semantic_similarity_threshold
                    ),
                    faiss_nprobe=self.default_faiss_nprobe,
                )
            except Exception as e:  # Catch exceptions from perform_search
                logger.error(
                    f"Error during perform_search execution: {e}", exc_info=True
                )
                # Re-raise to allow higher-level error handling if needed by the caller
                # (e.g., MCP server might want to return a specific error response)
                raise

            # Number of candidates before applying the final limit.
            total_candidates = len(ranked_results_orm)

            # Serialize only the slice that will actually be returned (the
            # rest would be discarded anyway), and do it while the session is
            # still open so attribute access cannot hit a detached instance.
            final_results = [
                self._serialize_sg_to_api_item(sg_obj)
                for sg_obj, _scores in ranked_results_orm[:actual_limit]
            ]

        processing_time_ms = int((time.perf_counter() - start_time) * 1000)

        return APISearchResponse(
            query=query,
            packages_applied=package_filters,
            results=final_results,
            count=len(final_results),
            total_candidates_considered=total_candidates,
            processing_time_ms=processing_time_ms,
        )

    def get_by_id(self, group_id: int) -> Optional[APISearchResultItem]:
        """Retrieves a specific statement group by its ID from local data.

        Args:
            group_id: The unique identifier of the statement group.

        Returns:
            An APISearchResultItem if found, otherwise None.
        """
        with self.SessionLocal() as session:
            try:
                stmt_group_orm = (
                    session.query(StatementGroup)
                    .options(joinedload(StatementGroup.primary_declaration))
                    .filter(StatementGroup.id == group_id)
                    .first()
                )
                if stmt_group_orm:
                    return self._serialize_sg_to_api_item(stmt_group_orm)
                return None
            except SQLAlchemyError as e:
                logger.error(
                    f"Database error in get_by_id for group_id {group_id}: {e}",
                    exc_info=True,
                )
                # For a service method, returning None on DB error might be acceptable,
                # or raise a custom service-level exception.
                return None
            except Exception as e:  # Catch any other unexpected errors
                logger.error(
                    f"Unexpected error in get_by_id for group_id {group_id}: {e}",
                    exc_info=True,
                )
                return None

    def get_dependencies(self, group_id: int) -> Optional[APICitationsResponse]:
        """Retrieves citations for a specific statement group from local data.

        Citations are the statement groups that the specified group_id depends on.

        Args:
            group_id: The unique identifier of the statement group for which
                to fetch citations.

        Returns:
            An APICitationsResponse object if the source group is found and has
            citations, or an APICitationsResponse with an empty list if no
            citations, otherwise None if the source group itself is not found or
            a DB error occurs.
        """
        with self.SessionLocal() as session:
            try:
                # Check if the source statement group exists
                source_group_exists = (
                    session.query(StatementGroup.id)
                    .filter(StatementGroup.id == group_id)
                    .first()
                )
                if not source_group_exists:
                    logger.warning(
                        f"Source statement group ID {group_id} not found for "
                        "dependency lookup."
                    )
                    return None  # Source group does not exist

                # Query for statement groups that `group_id` depends on (citations)
                cited_target_groups_orm = (
                    session.query(StatementGroup)
                    .join(
                        StatementGroupDependency,
                        StatementGroup.id
                        == StatementGroupDependency.target_statement_group_id,
                    )
                    .filter(
                        StatementGroupDependency.source_statement_group_id == group_id
                    )
                    .options(joinedload(StatementGroup.primary_declaration))
                    .all()
                )

                citations_api_items = [
                    self._serialize_sg_to_api_item(sg_orm)
                    for sg_orm in cited_target_groups_orm
                ]

                return APICitationsResponse(
                    source_group_id=group_id,
                    citations=citations_api_items,
                    count=len(citations_api_items),
                )
            except SQLAlchemyError as e:
                logger.error(
                    f"Database error in get_dependencies for group_id {group_id}: {e}",
                    exc_info=True,
                )
                return None
            except Exception as e:  # Catch any other unexpected errors
                logger.error(
                    f"Unexpected error in get_dependencies for "
                    f"group_id {group_id}: {e}",
                    exc_info=True,
                )
                return None
@@ -0,0 +1 @@
1
+ """Local package for lean explore."""
@@ -0,0 +1,107 @@
1
+ # src/lean_explore/mcp/app.py
2
+
3
+ """Initializes the FastMCP application and its lifespan context.
4
+
5
+ This module creates the main FastMCP application instance and defines
6
+ a lifespan context manager. The lifespan manager is responsible for
7
+ making the configured backend service (API client or local service)
8
+ available to MCP tools via the request context. The actual backend
9
+ instance will be set by the server startup script before running the app.
10
+ """
11
+
12
+ import logging
13
+ from contextlib import asynccontextmanager
14
+ from dataclasses import dataclass
15
+ from typing import AsyncIterator, Union
16
+
17
+ from mcp.server.fastmcp import FastMCP
18
+
19
+ # Import your backend service types for type hinting
20
+ from lean_explore.api.client import Client as APIClient
21
+ from lean_explore.local.service import Service as LocalService
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Define a type for the backend service to be used by tools
26
+ BackendServiceType = Union[APIClient, LocalService, None]
27
+
28
+
29
@dataclass
class AppContext:
    """Application-level context handed to every MCP tool invocation.

    Attributes:
        backend_service: The backend that tools call to perform actions —
            either an APIClient (remote API) or a LocalService (local data
            assets). Remains None if the server startup script never
            attached a backend.
    """

    backend_service: BackendServiceType
40
+
41
+
42
@asynccontextmanager
async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
    """Asynchronous context manager for the MCP application's lifespan.

    This function is called by FastMCP when the server starts and stops.
    It retrieves the backend service instance (which should have been
    initialized and attached to an attribute of the `server` instance,
    e.g., `server._lean_explore_backend_service`, by the main server script)
    and makes it available in the AppContext.

    Args:
        server: The FastMCP application instance.

    Yields:
        AppContext: The application context containing the backend service.

    Raises:
        RuntimeError: If the backend service has not been initialized and
            set on an attribute of the `server` instance prior to
            the app running.
    """
    logger.info("MCP application lifespan starting...")

    # The main server script (mcp/server.py) is expected to instantiate
    # the backend (APIClient or LocalService) based on its startup arguments
    # and store it as an attribute on the mcp_app instance (e.g.,
    # mcp_app._lean_explore_backend_service) before mcp_app.run() is called.
    backend_service_instance: BackendServiceType = getattr(
        server, "_lean_explore_backend_service", None
    )

    if backend_service_instance is None:
        logger.error(
            "Backend service not found on the FastMCP app instance. "
            "The MCP server script must set this attribute (e.g., "
            "'_lean_explore_backend_service') before running the app."
        )
        # Fix: the original message contained the adjacent string literal
        # "on the FastMCP app instance." twice, so the user-facing error
        # repeated that phrase.
        raise RuntimeError(
            "Backend service not initialized for MCP app. "
            "Ensure the server script correctly sets the backend service attribute "
            "on the FastMCP app instance."
        )

    app_context = AppContext(backend_service=backend_service_instance)

    try:
        yield app_context
    finally:
        logger.info("MCP application lifespan shutting down...")
93
+
94
+
95
# Create the FastMCP application instance.
# The lifespan manager is passed in here so FastMCP runs app_lifespan around
# the server's start/stop, making the backend service available to tools.
mcp_app = FastMCP(
    "LeanExploreMCPServer",
    version="0.1.0",
    description=(
        "MCP Server for Lean Explore, providing tools to search and query Lean"
        " mathematical data."
    ),
    lifespan=app_lifespan,
)

# NOTE(review): `lifespan` was already supplied to the FastMCP constructor
# above; re-assigning the raw async context manager here looks redundant and
# may overwrite any wrapping FastMCP performed internally — confirm whether
# this line is actually required.
mcp_app.lifespan = app_lifespan