lean-explore 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. lean_explore/__init__.py +14 -1
  2. lean_explore/api/__init__.py +12 -1
  3. lean_explore/api/client.py +60 -80
  4. lean_explore/cli/__init__.py +10 -1
  5. lean_explore/cli/data_commands.py +157 -479
  6. lean_explore/cli/display.py +171 -0
  7. lean_explore/cli/main.py +51 -608
  8. lean_explore/config.py +244 -0
  9. lean_explore/extract/__init__.py +5 -0
  10. lean_explore/extract/__main__.py +368 -0
  11. lean_explore/extract/doc_gen4.py +200 -0
  12. lean_explore/extract/doc_parser.py +499 -0
  13. lean_explore/extract/embeddings.py +371 -0
  14. lean_explore/extract/github.py +110 -0
  15. lean_explore/extract/index.py +317 -0
  16. lean_explore/extract/informalize.py +653 -0
  17. lean_explore/extract/package_config.py +59 -0
  18. lean_explore/extract/package_registry.py +45 -0
  19. lean_explore/extract/package_utils.py +105 -0
  20. lean_explore/extract/types.py +25 -0
  21. lean_explore/mcp/__init__.py +11 -1
  22. lean_explore/mcp/app.py +14 -46
  23. lean_explore/mcp/server.py +20 -35
  24. lean_explore/mcp/tools.py +70 -177
  25. lean_explore/models/__init__.py +9 -0
  26. lean_explore/models/search_db.py +76 -0
  27. lean_explore/models/search_types.py +53 -0
  28. lean_explore/search/__init__.py +32 -0
  29. lean_explore/search/engine.py +655 -0
  30. lean_explore/search/scoring.py +156 -0
  31. lean_explore/search/service.py +68 -0
  32. lean_explore/search/tokenization.py +71 -0
  33. lean_explore/util/__init__.py +28 -0
  34. lean_explore/util/embedding_client.py +92 -0
  35. lean_explore/util/logging.py +22 -0
  36. lean_explore/util/openrouter_client.py +63 -0
  37. lean_explore/util/reranker_client.py +189 -0
  38. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/METADATA +55 -10
  39. lean_explore-1.0.0.dist-info/RECORD +43 -0
  40. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/WHEEL +1 -1
  41. lean_explore-1.0.0.dist-info/entry_points.txt +2 -0
  42. lean_explore/cli/agent.py +0 -781
  43. lean_explore/cli/config_utils.py +0 -481
  44. lean_explore/defaults.py +0 -114
  45. lean_explore/local/__init__.py +0 -1
  46. lean_explore/local/search.py +0 -1050
  47. lean_explore/local/service.py +0 -392
  48. lean_explore/shared/__init__.py +0 -1
  49. lean_explore/shared/models/__init__.py +0 -1
  50. lean_explore/shared/models/api.py +0 -117
  51. lean_explore/shared/models/db.py +0 -396
  52. lean_explore-0.2.2.dist-info/RECORD +0 -26
  53. lean_explore-0.2.2.dist-info/entry_points.txt +0 -2
  54. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/licenses/LICENSE +0 -0
  55. {lean_explore-0.2.2.dist-info → lean_explore-1.0.0.dist-info}/top_level.txt +0 -0
lean_explore/mcp/tools.py CHANGED
@@ -1,27 +1,35 @@
1
- # src/lean_explore/mcp/tools.py
1
+ """Defines MCP tools for interacting with the Lean Explore search engine."""
2
2
 
3
- """Defines MCP tools for interacting with the Lean Explore search engine.
4
-
5
- These tools provide functionalities such as searching for statement groups,
6
- retrieving specific groups by ID, and getting their dependencies. They
7
- utilize a backend service (either an API client or a local service)
8
- made available through the MCP application context.
9
- """
10
-
11
- import asyncio # Needed for asyncio.iscoroutinefunction
3
+ import asyncio
12
4
  import logging
13
- from typing import Any, Dict, List, Optional
5
+ from typing import TypedDict
14
6
 
15
7
  from mcp.server.fastmcp import Context as MCPContext
16
8
 
17
9
  from lean_explore.mcp.app import AppContext, BackendServiceType, mcp_app
10
+ from lean_explore.models import SearchResponse, SearchResult
11
+
12
+
13
+ class SearchResultDict(TypedDict, total=False):
14
+ """Serialized SearchResult for MCP tool responses."""
15
+
16
+ id: int
17
+ name: str
18
+ module: str
19
+ docstring: str | None
20
+ source_text: str
21
+ source_link: str
22
+ dependencies: str | None
23
+ informalization: str | None
24
+
18
25
 
19
- # Import Pydantic models for type hinting and for creating response dicts
20
- from lean_explore.shared.models.api import (
21
- APICitationsResponse,
22
- APISearchResponse,
23
- APISearchResultItem,
24
- )
26
+ class SearchResponseDict(TypedDict, total=False):
27
+ """Serialized SearchResponse for MCP tool responses."""
28
+
29
+ query: str
30
+ results: list[SearchResultDict]
31
+ count: int
32
+ processing_time_ms: int | None
25
33
 
26
34
  logger = logging.getLogger(__name__)
27
35
 
@@ -33,210 +41,95 @@ async def _get_backend_from_context(ctx: MCPContext) -> BackendServiceType:
33
41
  ctx: The MCP context provided to the tool.
34
42
 
35
43
  Returns:
36
- The configured backend service (APIClient or LocalService).
37
- Guaranteed to be non-None if this function returns, otherwise
38
- it raises an exception.
44
+ The configured backend service (ApiClient or Service).
39
45
 
40
46
  Raises:
41
- RuntimeError: If the backend service is not available in the context,
42
- indicating a server configuration issue.
47
+ RuntimeError: If the backend service is not available in the context.
43
48
  """
44
49
  app_ctx: AppContext = ctx.request_context.lifespan_context
45
50
  backend = app_ctx.backend_service
46
51
  if not backend:
47
- logger.error(
48
- "MCP Tool Error: Backend service is not available in lifespan_context."
49
- )
52
+ logger.error("MCP Tool Error: Backend service is not available.")
50
53
  raise RuntimeError("Backend service not configured or available for MCP tool.")
51
54
  return backend
52
55
 
53
56
 
54
- def _prepare_mcp_result_item(backend_item: APISearchResultItem) -> APISearchResultItem:
55
- """Prepares an APISearchResultItem for MCP response.
56
-
57
- This helper ensures that the item sent over MCP does not include
58
- the display_statement_text, as the full statement_text is preferred
59
- for model consumption.
60
-
61
- Args:
62
- backend_item: The item as received from the backend service.
63
-
64
- Returns:
65
- A new APISearchResultItem instance suitable for MCP responses.
66
- """
67
- # Create a new instance or use .model_copy(update=...) for Pydantic v2
68
- return APISearchResultItem(
69
- id=backend_item.id,
70
- primary_declaration=backend_item.primary_declaration.model_copy()
71
- if backend_item.primary_declaration
72
- else None,
73
- source_file=backend_item.source_file,
74
- range_start_line=backend_item.range_start_line,
75
- statement_text=backend_item.statement_text,
76
- docstring=backend_item.docstring,
77
- informal_description=backend_item.informal_description,
78
- display_statement_text=None, # Ensure this is not sent over MCP
79
- )
80
-
81
-
82
57
  @mcp_app.tool()
83
58
  async def search(
84
59
  ctx: MCPContext,
85
60
  query: str,
86
- package_filters: Optional[List[str]] = None,
87
61
  limit: int = 10,
88
- ) -> Dict[str, Any]:
89
- """Searches Lean statement groups by a query string.
90
-
91
- This tool allows for filtering by package names and limits the number
92
- of results returned.
62
+ rerank_top: int | None = 50,
63
+ packages: list[str] | None = None,
64
+ ) -> SearchResponseDict:
65
+ """Searches Lean declarations by a query string.
93
66
 
94
67
  Args:
95
- ctx: The MCP context, providing access to shared resources like the
96
- backend service.
97
- query: The search query string. For example, "continuous function" or
98
- "prime number theorem".
99
- package_filters: An optional list of package names to filter the search
100
- results by. For example, `["Mathlib.Analysis",
101
- "Mathlib.Order"]`. If None or empty, no package filter
102
- is applied.
103
- limit: The maximum number of search results to return from this tool.
104
- Defaults to 10. Must be a positive integer.
68
+ ctx: The MCP context, providing access to the backend service.
69
+ query: A search query string, e.g., "continuous function".
70
+ limit: The maximum number of search results to return. Defaults to 10.
71
+ rerank_top: Number of candidates to rerank with cross-encoder. Set to 0 or
72
+ None to skip reranking. Defaults to 50. Only used with local backend.
73
+ packages: Filter results to specific packages (e.g., ["Mathlib", "Std"]).
74
+ Defaults to None (all packages).
105
75
 
106
76
  Returns:
107
- A dictionary corresponding to the APISearchResponse model, containing
108
- the search results (potentially truncated by the `limit` parameter of
109
- this tool), and metadata about the search operation. The
110
- `display_statement_text` field within each result item is omitted.
77
+ A dictionary containing the search response with results.
111
78
  """
112
79
  backend = await _get_backend_from_context(ctx)
113
80
  logger.info(
114
- f"MCP Tool 'search' called with query: '{query}', "
115
- f"packages: {package_filters}, tool_limit: {limit}"
81
+ f"MCP Tool 'search' called with query: '{query}', limit: {limit}, "
82
+ f"rerank_top: {rerank_top}, packages: {packages}"
116
83
  )
117
84
 
118
85
  if not hasattr(backend, "search"):
119
86
  logger.error("Backend service does not have a 'search' method.")
120
- # This should ideally return a structured error for MCP if possible.
121
- # For now, FastMCP will convert this RuntimeError.
122
87
  raise RuntimeError("Search functionality not available on configured backend.")
123
88
 
124
- tool_limit = max(1, limit) # Ensure limit is at least 1 for slicing
125
- api_response_pydantic: Optional[APISearchResponse]
126
-
127
- # Conditionally await based on the backend's search method type
89
+ # Call backend search (handle both async and sync)
128
90
  if asyncio.iscoroutinefunction(backend.search):
129
- api_response_pydantic = await backend.search(
130
- query=query,
131
- package_filters=package_filters,
132
- # The backend.search method uses its own internal default for limit
133
- # if None is passed, or the passed limit.
134
- # The MCP tool will truncate the results later using tool_limit.
91
+ response: SearchResponse = await backend.search(
92
+ query=query, limit=limit, rerank_top=rerank_top, packages=packages
135
93
  )
136
94
  else:
137
- api_response_pydantic = backend.search(
138
- query=query, package_filters=package_filters
139
- )
140
-
141
- if not api_response_pydantic:
142
- logger.warning("Backend search returned None, responding with empty results.")
143
- empty_response = APISearchResponse(
144
- query=query,
145
- packages_applied=package_filters or [],
146
- results=[],
147
- count=0,
148
- total_candidates_considered=0,
149
- processing_time_ms=0,
95
+ response: SearchResponse = backend.search(
96
+ query=query, limit=limit, rerank_top=rerank_top, packages=packages
150
97
  )
151
- return empty_response.model_dump(exclude_none=True)
152
-
153
- actual_backend_results = api_response_pydantic.results
154
98
 
155
- mcp_results_list = []
156
- for backend_item in actual_backend_results[:tool_limit]: # Apply MCP tool's limit
157
- mcp_results_list.append(_prepare_mcp_result_item(backend_item))
158
-
159
- final_mcp_response = APISearchResponse(
160
- query=api_response_pydantic.query,
161
- packages_applied=api_response_pydantic.packages_applied,
162
- results=mcp_results_list,
163
- count=len(mcp_results_list), # Count is after this tool's truncation
164
- total_candidates_considered=api_response_pydantic.total_candidates_considered,
165
- processing_time_ms=api_response_pydantic.processing_time_ms,
166
- )
167
-
168
- return final_mcp_response.model_dump(exclude_none=True)
99
+ # Return as dict for MCP
100
+ return response.model_dump(exclude_none=True)
169
101
 
170
102
 
171
103
  @mcp_app.tool()
172
- async def get_by_id(ctx: MCPContext, group_id: int) -> Optional[Dict[str, Any]]:
173
- """Retrieves a specific statement group by its unique identifier.
174
-
175
- The `display_statement_text` field is omitted from the response.
104
+ async def get_by_id(
105
+ ctx: MCPContext,
106
+ declaration_id: int,
107
+ ) -> SearchResultDict | None:
108
+ """Retrieves a specific declaration by its unique identifier.
176
109
 
177
110
  Args:
178
111
  ctx: The MCP context, providing access to the backend service.
179
- group_id: The unique integer identifier of the statement group to retrieve.
180
- For example, `12345`.
112
+ declaration_id: The unique integer identifier of the declaration.
181
113
 
182
114
  Returns:
183
- A dictionary corresponding to the APISearchResultItem model if a
184
- statement group with the given ID is found (with
185
- `display_statement_text` omitted). Returns None (which will be
186
- serialized as JSON null by MCP) if no such group exists.
115
+ A dictionary representing the SearchResult, or None if not found.
187
116
  """
188
117
  backend = await _get_backend_from_context(ctx)
189
- logger.info(f"MCP Tool 'get_by_id' called for group_id: {group_id}")
190
-
191
- backend_item: Optional[APISearchResultItem]
192
- if asyncio.iscoroutinefunction(backend.get_by_id):
193
- backend_item = await backend.get_by_id(group_id=group_id)
194
- else:
195
- backend_item = backend.get_by_id(group_id=group_id)
196
-
197
- if backend_item:
198
- mcp_item = _prepare_mcp_result_item(backend_item)
199
- return mcp_item.model_dump(exclude_none=True)
200
- return None
201
-
202
-
203
- @mcp_app.tool()
204
- async def get_dependencies(ctx: MCPContext, group_id: int) -> Optional[Dict[str, Any]]:
205
- """Retrieves the direct dependencies (citations) for a specific statement group.
206
-
207
- The `display_statement_text` field within each cited item is omitted
208
- from the response.
118
+ logger.info(f"MCP Tool 'get_by_id' called for declaration_id: {declaration_id}")
209
119
 
210
- Args:
211
- ctx: The MCP context, providing access to the backend service.
212
- group_id: The unique integer identifier of the statement group for which
213
- to fetch its direct dependencies. For example, `12345`.
214
-
215
- Returns:
216
- A dictionary corresponding to the APICitationsResponse model, which
217
- contains a list of cited statement groups (each with
218
- `display_statement_text` omitted), if the source group_id
219
- is found and has dependencies. Returns None (serialized as JSON null
220
- by MCP) if the source group is not found or has no dependencies.
221
- """
222
- backend = await _get_backend_from_context(ctx)
223
- logger.info(f"MCP Tool 'get_dependencies' called for group_id: {group_id}")
120
+ if not hasattr(backend, "get_by_id"):
121
+ logger.error("Backend service does not have a 'get_by_id' method.")
122
+ raise RuntimeError(
123
+ "Get by ID functionality not available on configured backend."
124
+ )
224
125
 
225
- backend_response: Optional[APICitationsResponse]
226
- if asyncio.iscoroutinefunction(backend.get_dependencies):
227
- backend_response = await backend.get_dependencies(group_id=group_id)
126
+ # Call backend get_by_id (handle both async and sync)
127
+ if asyncio.iscoroutinefunction(backend.get_by_id):
128
+ result: SearchResult | None = await backend.get_by_id(
129
+ declaration_id=declaration_id
130
+ )
228
131
  else:
229
- backend_response = backend.get_dependencies(group_id=group_id)
132
+ result: SearchResult | None = backend.get_by_id(declaration_id=declaration_id)
230
133
 
231
- if backend_response:
232
- mcp_citations_list = []
233
- for backend_item in backend_response.citations:
234
- mcp_citations_list.append(_prepare_mcp_result_item(backend_item))
235
-
236
- final_mcp_response = APICitationsResponse(
237
- source_group_id=backend_response.source_group_id,
238
- citations=mcp_citations_list,
239
- count=len(mcp_citations_list),
240
- )
241
- return final_mcp_response.model_dump(exclude_none=True)
242
- return None
134
+ # Return as dict for MCP, or None
135
+ return result.model_dump(exclude_none=True) if result else None
lean_explore/models/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """Data models for lean_explore.
2
+
3
+ This package contains database models and type definitions for search results.
4
+ """
5
+
6
+ from lean_explore.models.search_db import Base, Declaration
7
+ from lean_explore.models.search_types import SearchResponse, SearchResult
8
+
9
+ __all__ = ["Base", "Declaration", "SearchResult", "SearchResponse"]
lean_explore/models/search_db.py ADDED
@@ -0,0 +1,76 @@
1
+ """SQLAlchemy ORM models for Lean declaration database.
2
+
3
+ Simple schema for a Lean declaration search engine.
4
+ Uses SQLAlchemy 2.0 syntax with SQLite for storage and FAISS for vector search.
5
+ """
6
+
7
+ import struct
8
+
9
+ from sqlalchemy import Integer, LargeBinary, Text
10
+ from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
11
+ from sqlalchemy.types import TypeDecorator
12
+
13
+
14
+ class BinaryEmbedding(TypeDecorator):
15
+ """Custom type for storing embeddings as binary blobs.
16
+
17
+ Converts between Python list[float] and compact binary representation.
18
+ Uses float32 (4 bytes per dimension) for ~5x space savings over JSON.
19
+ """
20
+
21
+ impl = LargeBinary
22
+ cache_ok = True
23
+
24
+ def process_bind_param(self, value: list[float] | None, dialect) -> bytes | None:
25
+ """Convert list[float] to binary for storage."""
26
+ if value is None:
27
+ return None
28
+ return struct.pack(f"{len(value)}f", *value)
29
+
30
+ def process_result_value(self, value: bytes | None, dialect) -> list[float] | None:
31
+ """Convert binary back to list[float] on retrieval."""
32
+ if value is None:
33
+ return None
34
+ num_floats = len(value) // 4
35
+ return list(struct.unpack(f"{num_floats}f", value))
36
+
37
+
38
+ class Base(DeclarativeBase):
39
+ """Base class for SQLAlchemy declarative models."""
40
+
41
+ pass
42
+
43
+
44
+ class Declaration(Base):
45
+ """Represents a Lean declaration for search."""
46
+
47
+ __tablename__ = "declarations"
48
+
49
+ id: Mapped[int] = mapped_column(Integer, primary_key=True)
50
+ """Primary key identifier."""
51
+
52
+ name: Mapped[str] = mapped_column(Text, unique=True, index=True, nullable=False)
53
+ """Fully qualified Lean name (e.g., 'Nat.add')."""
54
+
55
+ module: Mapped[str] = mapped_column(Text, index=True, nullable=False)
56
+ """Module name (e.g., 'Mathlib.Data.List.Basic')."""
57
+
58
+ docstring: Mapped[str | None] = mapped_column(Text, nullable=True)
59
+ """Documentation string from the source code, if available."""
60
+
61
+ source_text: Mapped[str] = mapped_column(Text, nullable=False)
62
+ """The actual Lean source code for this declaration."""
63
+
64
+ source_link: Mapped[str] = mapped_column(Text, nullable=False)
65
+ """GitHub URL to the declaration source code."""
66
+
67
+ dependencies: Mapped[str | None] = mapped_column(Text, nullable=True)
68
+ """JSON array of declaration names this declaration depends on."""
69
+
70
+ informalization: Mapped[str | None] = mapped_column(Text, nullable=True)
71
+ """Natural language description of the declaration."""
72
+
73
+ informalization_embedding: Mapped[list[float] | None] = mapped_column(
74
+ BinaryEmbedding, nullable=True
75
+ )
76
+ """1024-dimensional embedding of the informalization text (binary float32)."""
lean_explore/models/search_types.py ADDED
@@ -0,0 +1,53 @@
1
+ """Type definitions for search results and related data structures."""
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+
5
+
6
+ class SearchResult(BaseModel):
7
+ """A search result representing a Lean declaration.
8
+
9
+ This model represents the core information returned from a search query,
10
+ mirroring the essential fields from the database Declaration model.
11
+ """
12
+
13
+ id: int
14
+ """Primary key identifier."""
15
+
16
+ name: str
17
+ """Fully qualified Lean name (e.g., 'Nat.add')."""
18
+
19
+ module: str
20
+ """Module name (e.g., 'Mathlib.Data.List.Basic')."""
21
+
22
+ docstring: str | None
23
+ """Documentation string from the source code, if available."""
24
+
25
+ source_text: str
26
+ """The actual Lean source code for this declaration."""
27
+
28
+ source_link: str
29
+ """GitHub URL to the declaration source code."""
30
+
31
+ dependencies: str | None
32
+ """JSON array of declaration names this declaration depends on."""
33
+
34
+ informalization: str | None
35
+ """Natural language description of the declaration."""
36
+
37
+ model_config = ConfigDict(from_attributes=True)
38
+
39
+
40
+ class SearchResponse(BaseModel):
41
+ """Response from a search operation containing results and metadata."""
42
+
43
+ query: str
44
+ """The original search query string."""
45
+
46
+ results: list[SearchResult]
47
+ """List of search results."""
48
+
49
+ count: int
50
+ """Number of results returned."""
51
+
52
+ processing_time_ms: int | None = None
53
+ """Processing time in milliseconds, if available."""
lean_explore/search/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """Search package for Lean Explore.
2
+
3
+ This package provides hybrid search for Lean declarations using BM25 lexical
4
+ matching and FAISS semantic search, combined via Reciprocal Rank Fusion.
5
+
6
+ Modules:
7
+ engine: Core SearchEngine class with hybrid retrieval and cross-encoder reranking.
8
+ scoring: Score normalization and fusion algorithms (RRF, weighted fusion).
9
+ service: Service layer wrapper for search operations.
10
+ tokenization: Text tokenization utilities for Lean declaration names.
11
+
12
+ Note: SearchEngine and Service are lazily imported to avoid loading FAISS at module
13
+ import time, which helps prevent OpenMP library conflicts with torch on macOS.
14
+ """
15
+
16
+ from lean_explore.models import SearchResponse, SearchResult
17
+
18
+
19
+ def __getattr__(name: str):
20
+ """Lazy import SearchEngine and Service to avoid FAISS loading at import time."""
21
+ if name == "SearchEngine":
22
+ from lean_explore.search.engine import SearchEngine
23
+
24
+ return SearchEngine
25
+ if name == "Service":
26
+ from lean_explore.search.service import Service
27
+
28
+ return Service
29
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
30
+
31
+
32
+ __all__ = ["SearchEngine", "Service", "SearchResponse", "SearchResult"]