lean-explore 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lean_explore/__init__.py +14 -1
- lean_explore/api/__init__.py +12 -1
- lean_explore/api/client.py +64 -176
- lean_explore/cli/__init__.py +10 -1
- lean_explore/cli/data_commands.py +157 -479
- lean_explore/cli/display.py +171 -0
- lean_explore/cli/main.py +51 -608
- lean_explore/config.py +244 -0
- lean_explore/extract/__init__.py +5 -0
- lean_explore/extract/__main__.py +368 -0
- lean_explore/extract/doc_gen4.py +200 -0
- lean_explore/extract/doc_parser.py +499 -0
- lean_explore/extract/embeddings.py +371 -0
- lean_explore/extract/github.py +110 -0
- lean_explore/extract/index.py +317 -0
- lean_explore/extract/informalize.py +653 -0
- lean_explore/extract/package_config.py +59 -0
- lean_explore/extract/package_registry.py +45 -0
- lean_explore/extract/package_utils.py +105 -0
- lean_explore/extract/types.py +25 -0
- lean_explore/mcp/__init__.py +11 -1
- lean_explore/mcp/app.py +14 -46
- lean_explore/mcp/server.py +20 -35
- lean_explore/mcp/tools.py +70 -205
- lean_explore/models/__init__.py +9 -0
- lean_explore/models/search_db.py +76 -0
- lean_explore/models/search_types.py +53 -0
- lean_explore/search/__init__.py +32 -0
- lean_explore/search/engine.py +655 -0
- lean_explore/search/scoring.py +156 -0
- lean_explore/search/service.py +68 -0
- lean_explore/search/tokenization.py +71 -0
- lean_explore/util/__init__.py +28 -0
- lean_explore/util/embedding_client.py +92 -0
- lean_explore/util/logging.py +22 -0
- lean_explore/util/openrouter_client.py +63 -0
- lean_explore/util/reranker_client.py +189 -0
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/METADATA +32 -9
- lean_explore-1.0.0.dist-info/RECORD +43 -0
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/WHEEL +1 -1
- lean_explore-1.0.0.dist-info/entry_points.txt +2 -0
- lean_explore/cli/agent.py +0 -788
- lean_explore/cli/config_utils.py +0 -481
- lean_explore/defaults.py +0 -114
- lean_explore/local/__init__.py +0 -1
- lean_explore/local/search.py +0 -1050
- lean_explore/local/service.py +0 -479
- lean_explore/shared/__init__.py +0 -1
- lean_explore/shared/models/__init__.py +0 -1
- lean_explore/shared/models/api.py +0 -117
- lean_explore/shared/models/db.py +0 -396
- lean_explore-0.3.0.dist-info/RECORD +0 -26
- lean_explore-0.3.0.dist-info/entry_points.txt +0 -2
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {lean_explore-0.3.0.dist-info → lean_explore-1.0.0.dist-info}/top_level.txt +0 -0
lean_explore/mcp/tools.py
CHANGED
|
@@ -1,27 +1,35 @@
|
|
|
1
|
-
|
|
1
|
+
"""Defines MCP tools for interacting with the Lean Explore search engine."""
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
These tools provide functionalities such as searching for statement groups,
|
|
6
|
-
retrieving specific groups by ID, and getting their dependencies. They
|
|
7
|
-
utilize a backend service (either an API client or a local service)
|
|
8
|
-
made available through the MCP application context.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import asyncio # Needed for asyncio.iscoroutinefunction
|
|
3
|
+
import asyncio
|
|
12
4
|
import logging
|
|
13
|
-
from typing import
|
|
5
|
+
from typing import TypedDict
|
|
14
6
|
|
|
15
7
|
from mcp.server.fastmcp import Context as MCPContext
|
|
16
8
|
|
|
17
9
|
from lean_explore.mcp.app import AppContext, BackendServiceType, mcp_app
|
|
10
|
+
from lean_explore.models import SearchResponse, SearchResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SearchResultDict(TypedDict, total=False):
|
|
14
|
+
"""Serialized SearchResult for MCP tool responses."""
|
|
15
|
+
|
|
16
|
+
id: int
|
|
17
|
+
name: str
|
|
18
|
+
module: str
|
|
19
|
+
docstring: str | None
|
|
20
|
+
source_text: str
|
|
21
|
+
source_link: str
|
|
22
|
+
dependencies: str | None
|
|
23
|
+
informalization: str | None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SearchResponseDict(TypedDict, total=False):
    """Plain-dict shape of a serialized ``SearchResponse``.

    This is the declared return type of the ``search`` tool. ``total=False``
    makes every key optional for type-checking purposes.
    """

    # The query the response belongs to.
    query: str

    # Serialized result entries and how many there are.
    results: list[SearchResultDict]
    count: int

    # Timing metadata.
    processing_time_ms: int | None
|
|
25
33
|
|
|
26
34
|
logger = logging.getLogger(__name__)
|
|
27
35
|
|
|
@@ -33,238 +41,95 @@ async def _get_backend_from_context(ctx: MCPContext) -> BackendServiceType:
|
|
|
33
41
|
ctx: The MCP context provided to the tool.
|
|
34
42
|
|
|
35
43
|
Returns:
|
|
36
|
-
The configured backend service (
|
|
37
|
-
Guaranteed to be non-None if this function returns, otherwise
|
|
38
|
-
it raises an exception.
|
|
44
|
+
The configured backend service (ApiClient or Service).
|
|
39
45
|
|
|
40
46
|
Raises:
|
|
41
|
-
RuntimeError: If the backend service is not available in the context
|
|
42
|
-
indicating a server configuration issue.
|
|
47
|
+
RuntimeError: If the backend service is not available in the context.
|
|
43
48
|
"""
|
|
44
49
|
app_ctx: AppContext = ctx.request_context.lifespan_context
|
|
45
50
|
backend = app_ctx.backend_service
|
|
46
51
|
if not backend:
|
|
47
|
-
logger.error(
|
|
48
|
-
"MCP Tool Error: Backend service is not available in lifespan_context."
|
|
49
|
-
)
|
|
52
|
+
logger.error("MCP Tool Error: Backend service is not available.")
|
|
50
53
|
raise RuntimeError("Backend service not configured or available for MCP tool.")
|
|
51
54
|
return backend
|
|
52
55
|
|
|
53
56
|
|
|
54
|
-
def _prepare_mcp_result_item(backend_item: APISearchResultItem) -> APISearchResultItem:
|
|
55
|
-
"""Prepares an APISearchResultItem for MCP response.
|
|
56
|
-
|
|
57
|
-
This helper ensures that the item sent over MCP does not include
|
|
58
|
-
the display_statement_text, as the full statement_text is preferred
|
|
59
|
-
for model consumption.
|
|
60
|
-
|
|
61
|
-
Args:
|
|
62
|
-
backend_item: The item as received from the backend service.
|
|
63
|
-
|
|
64
|
-
Returns:
|
|
65
|
-
A new APISearchResultItem instance suitable for MCP responses.
|
|
66
|
-
"""
|
|
67
|
-
# Create a new instance or use .model_copy(update=...) for Pydantic v2
|
|
68
|
-
return APISearchResultItem(
|
|
69
|
-
id=backend_item.id,
|
|
70
|
-
primary_declaration=backend_item.primary_declaration.model_copy()
|
|
71
|
-
if backend_item.primary_declaration
|
|
72
|
-
else None,
|
|
73
|
-
source_file=backend_item.source_file,
|
|
74
|
-
range_start_line=backend_item.range_start_line,
|
|
75
|
-
statement_text=backend_item.statement_text,
|
|
76
|
-
docstring=backend_item.docstring,
|
|
77
|
-
informal_description=backend_item.informal_description,
|
|
78
|
-
display_statement_text=None, # Ensure this is not sent over MCP
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
|
|
82
57
|
@mcp_app.tool()
async def search(
    ctx: MCPContext,
    query: str,
    limit: int = 10,
    rerank_top: int | None = 50,
    packages: list[str] | None = None,
) -> SearchResponseDict:
    """Searches Lean declarations by a query string.

    Args:
        ctx: The MCP context, providing access to the backend service.
        query: A search query string, e.g., "continuous function".
        limit: The maximum number of search results to return. Defaults to 10.
        rerank_top: Number of candidates to rerank with cross-encoder. Set to 0 or
            None to skip reranking. Defaults to 50. Only used with local backend.
        packages: Filter results to specific packages (e.g., ["Mathlib", "Std"]).
            Defaults to None (all packages).

    Returns:
        A dictionary containing the search response with results.
    """
    # NOTE(review): the docstring above is left untouched on purpose; it is
    # presumably surfaced to MCP clients as the tool description -- confirm
    # before rewording it.

    # Function-scope import so this block does not depend on the module's
    # import section.
    import inspect

    # Raises RuntimeError when no backend is configured.
    backend = await _get_backend_from_context(ctx)
    logger.info(
        f"MCP Tool 'search' called with query: '{query}', limit: {limit}, "
        f"rerank_top: {rerank_top}, packages: {packages}"
    )

    if not hasattr(backend, "search"):
        logger.error("Backend service does not have a 'search' method.")
        raise RuntimeError("Search functionality not available on configured backend.")

    # Call the backend exactly once and await only if it handed back an
    # awaitable. This covers plain sync methods, ``async def`` methods, and
    # wrapped callables (e.g. functools.partial) that a check on the function
    # object itself would misclassify.
    outcome = backend.search(
        query=query, limit=limit, rerank_top=rerank_top, packages=packages
    )
    response: SearchResponse = await outcome if inspect.isawaitable(outcome) else outcome

    # Serialize to a plain dict for MCP; fields that are None are dropped.
    return response.model_dump(exclude_none=True)
|
|
167
101
|
|
|
168
102
|
|
|
169
103
|
@mcp_app.tool()
async def get_by_id(
    ctx: MCPContext,
    declaration_id: int,
) -> SearchResultDict | None:
    """Retrieves a specific declaration by its unique identifier.

    Args:
        ctx: The MCP context, providing access to the backend service.
        declaration_id: The unique integer identifier of the declaration.

    Returns:
        A dictionary representing the SearchResult, or None if not found.
    """
    # NOTE(review): the docstring above is left untouched on purpose; it is
    # presumably surfaced to MCP clients as the tool description -- confirm
    # before rewording it.

    # Function-scope import so this block does not depend on the module's
    # import section.
    import inspect

    # Raises RuntimeError when no backend is configured.
    backend = await _get_backend_from_context(ctx)
    logger.info(f"MCP Tool 'get_by_id' called for declaration_id: {declaration_id}")

    if not hasattr(backend, "get_by_id"):
        logger.error("Backend service does not have a 'get_by_id' method.")
        raise RuntimeError(
            "Get by ID functionality not available on configured backend."
        )

    # Call the backend exactly once and await only if it handed back an
    # awaitable, so sync, async and wrapped implementations all work.
    outcome = backend.get_by_id(declaration_id=declaration_id)
    result: SearchResult | None = (
        await outcome if inspect.isawaitable(outcome) else outcome
    )

    # A missing declaration is reported to the caller as None.
    if result is None:
        return None

    # Serialize to a plain dict for MCP; fields that are None are dropped.
    return result.model_dump(exclude_none=True)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Data models for lean_explore.
|
|
2
|
+
|
|
3
|
+
This package contains database models and type definitions for search results.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from lean_explore.models.search_db import Base, Declaration
|
|
7
|
+
from lean_explore.models.search_types import SearchResponse, SearchResult
|
|
8
|
+
|
|
9
|
+
__all__ = ["Base", "Declaration", "SearchResult", "SearchResponse"]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""SQLAlchemy ORM models for Lean declaration database.
|
|
2
|
+
|
|
3
|
+
Simple schema for a Lean declaration search engine.
|
|
4
|
+
Uses SQLAlchemy 2.0 syntax with SQLite for storage and FAISS for vector search.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import struct
|
|
8
|
+
|
|
9
|
+
from sqlalchemy import Integer, LargeBinary, Text
|
|
10
|
+
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
|
|
11
|
+
from sqlalchemy.types import TypeDecorator
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BinaryEmbedding(TypeDecorator):
    """Custom type for storing embeddings as binary blobs.

    Converts between Python list[float] and compact binary representation.
    Uses float32 (4 bytes per dimension) for ~5x space savings over JSON.

    The blob is always encoded as little-endian IEEE-754 float32 so that a
    database file decodes to the same values regardless of the byte order of
    the machine that wrote or reads it. On little-endian hosts this is
    byte-identical to the native ``"f"`` layout.
    """

    impl = LargeBinary
    cache_ok = True

    def process_bind_param(self, value: list[float] | None, dialect) -> bytes | None:
        """Convert list[float] to binary for storage.

        Args:
            value: The embedding to store, or None.
            dialect: The SQLAlchemy dialect in use (unused).

        Returns:
            ``4 * len(value)`` bytes of little-endian float32 data, or None
            when ``value`` is None.
        """
        if value is None:
            return None
        # "<" pins byte order and standard sizes; a bare "f" would use the
        # host's native order and make the stored blob platform-dependent.
        return struct.pack(f"<{len(value)}f", *value)

    def process_result_value(self, value: bytes | None, dialect) -> list[float] | None:
        """Convert binary back to list[float] on retrieval.

        Args:
            value: The stored blob, or None.
            dialect: The SQLAlchemy dialect in use (unused).

        Returns:
            The decoded floats, or None when ``value`` is None.

        Raises:
            struct.error: If the blob length is not a multiple of 4 bytes.
        """
        if value is None:
            return None
        # Each float32 occupies exactly 4 bytes in the standard ("<") layout.
        num_floats = len(value) // 4
        return list(struct.unpack(f"<{num_floats}f", value))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Base(DeclarativeBase):
    """Declarative base from which the ORM models in this module derive."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Declaration(Base):
    """ORM row describing one searchable Lean declaration."""

    __tablename__ = "declarations"

    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
    )
    """Integer primary key."""

    name: Mapped[str] = mapped_column(
        Text,
        unique=True,
        index=True,
        nullable=False,
    )
    """Fully qualified Lean name, e.g. 'Nat.add'; unique and indexed."""

    module: Mapped[str] = mapped_column(
        Text,
        index=True,
        nullable=False,
    )
    """Name of the containing module, e.g. 'Mathlib.Data.List.Basic'; indexed."""

    docstring: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
    )
    """Docstring taken from the source code; NULL when there is none."""

    source_text: Mapped[str] = mapped_column(
        Text,
        nullable=False,
    )
    """Lean source code of the declaration."""

    source_link: Mapped[str] = mapped_column(
        Text,
        nullable=False,
    )
    """GitHub URL pointing at the declaration's source."""

    dependencies: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
    )
    """JSON array (stored as text) of the declaration names this one depends on."""

    informalization: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
    )
    """Natural-language description of the declaration."""

    informalization_embedding: Mapped[list[float] | None] = mapped_column(
        BinaryEmbedding,
        nullable=True,
    )
    """1024-dimensional embedding of the informalization, stored as binary float32."""
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Type definitions for search results and related data structures."""
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, ConfigDict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SearchResult(BaseModel):
    """One Lean declaration as returned by a search.

    Carries the core fields of the database ``Declaration`` model.
    ``from_attributes=True`` is set so an instance can be validated directly
    from an object exposing these names as attributes.
    """

    model_config = ConfigDict(from_attributes=True)

    id: int
    """Primary key of the declaration."""

    name: str
    """Fully qualified Lean name, e.g. 'Nat.add'."""

    module: str
    """Name of the containing module, e.g. 'Mathlib.Data.List.Basic'."""

    docstring: str | None
    """Docstring taken from the source code, or None when there is none."""

    source_text: str
    """Lean source code of the declaration."""

    source_link: str
    """GitHub URL pointing at the declaration's source."""

    dependencies: str | None
    """JSON array of the declaration names this one depends on, if any."""

    informalization: str | None
    """Natural-language description of the declaration, if any."""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SearchResponse(BaseModel):
    """Outcome of one search call: the result list plus metadata about it."""

    query: str
    """Query string the search was run with."""

    results: list[SearchResult]
    """The search results."""

    count: int
    """How many results were returned."""

    processing_time_ms: int | None = None
    """Time spent processing, in milliseconds; None when not available."""
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Search package for Lean Explore.
|
|
2
|
+
|
|
3
|
+
This package provides hybrid search for Lean declarations using BM25 lexical
|
|
4
|
+
matching and FAISS semantic search, combined via Reciprocal Rank Fusion.
|
|
5
|
+
|
|
6
|
+
Modules:
|
|
7
|
+
engine: Core SearchEngine class with hybrid retrieval and cross-encoder reranking.
|
|
8
|
+
scoring: Score normalization and fusion algorithms (RRF, weighted fusion).
|
|
9
|
+
service: Service layer wrapper for search operations.
|
|
10
|
+
tokenization: Text tokenization utilities for Lean declaration names.
|
|
11
|
+
|
|
12
|
+
Note: SearchEngine and Service are lazily imported to avoid loading FAISS at module
|
|
13
|
+
import time, which helps prevent OpenMP library conflicts with torch on macOS.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from lean_explore.models import SearchResponse, SearchResult
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def __getattr__(name: str):
    """Resolve ``SearchEngine`` and ``Service`` on first attribute access.

    Deferring these two imports keeps FAISS from being loaded when the
    package itself is imported. Any other name is rejected with the usual
    module-level ``AttributeError``.
    """
    # Guard clause: only the two lazily exported names are handled here.
    if name not in ("SearchEngine", "Service"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    if name == "SearchEngine":
        from lean_explore.search.engine import SearchEngine as resolved
    else:
        from lean_explore.search.service import Service as resolved

    return resolved
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
__all__ = ["SearchEngine", "Service", "SearchResponse", "SearchResult"]
|