lean-explore 0.1.4__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {lean_explore-0.1.4 → lean_explore-0.2.0}/PKG-INFO +5 -4
  2. {lean_explore-0.1.4 → lean_explore-0.2.0}/README.md +2 -3
  3. {lean_explore-0.1.4 → lean_explore-0.2.0}/pyproject.toml +4 -2
  4. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/cli/data_commands.py +100 -42
  5. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/defaults.py +7 -10
  6. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/local/search.py +285 -156
  7. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/local/service.py +12 -10
  8. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/shared/models/db.py +7 -22
  9. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore.egg-info/PKG-INFO +5 -4
  10. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore.egg-info/requires.txt +3 -1
  11. {lean_explore-0.1.4 → lean_explore-0.2.0}/tests/test_defaults.py +19 -16
  12. {lean_explore-0.1.4 → lean_explore-0.2.0}/LICENSE +0 -0
  13. {lean_explore-0.1.4 → lean_explore-0.2.0}/setup.cfg +0 -0
  14. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/__init__.py +0 -0
  15. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/api/__init__.py +0 -0
  16. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/api/client.py +0 -0
  17. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/cli/__init__.py +0 -0
  18. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/cli/agent.py +0 -0
  19. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/cli/config_utils.py +0 -0
  20. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/cli/main.py +0 -0
  21. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/local/__init__.py +0 -0
  22. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/mcp/__init__.py +0 -0
  23. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/mcp/app.py +0 -0
  24. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/mcp/server.py +0 -0
  25. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/mcp/tools.py +0 -0
  26. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/shared/__init__.py +0 -0
  27. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/shared/models/__init__.py +0 -0
  28. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore/shared/models/api.py +0 -0
  29. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore.egg-info/SOURCES.txt +0 -0
  30. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore.egg-info/dependency_links.txt +0 -0
  31. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore.egg-info/entry_points.txt +0 -0
  32. {lean_explore-0.1.4 → lean_explore-0.2.0}/src/lean_explore.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lean-explore
3
- Version: 0.1.4
3
+ Version: 0.2.0
4
4
  Summary: A project to explore and rank Lean mathematical declarations.
5
5
  Author-email: Justin Asher <justinchadwickasher@gmail.com>
6
6
  License: Apache License
@@ -228,8 +228,9 @@ Requires-Dist: sqlalchemy>=2.0
228
228
  Requires-Dist: numpy>=1.20
229
229
  Requires-Dist: faiss-cpu>=1.7
230
230
  Requires-Dist: sentence-transformers>=2.2.0
231
- Requires-Dist: rapidfuzz>=3.0.0
232
231
  Requires-Dist: filelock>=3.0.0
232
+ Requires-Dist: nltk>=3.6
233
+ Requires-Dist: rank-bm25>=0.2.2
233
234
  Requires-Dist: httpx>=0.23.0
234
235
  Requires-Dist: pydantic>=2.0
235
236
  Requires-Dist: typer[all]>=0.9.0
@@ -237,6 +238,7 @@ Requires-Dist: toml>=0.10.0
237
238
  Requires-Dist: openai-agents>=0.0.16
238
239
  Requires-Dist: mcp>=1.9.0
239
240
  Requires-Dist: tqdm>=4.60
241
+ Requires-Dist: requests>=2.25.0
240
242
  Dynamic: license-file
241
243
 
242
244
  # LeanExplore
@@ -261,7 +263,7 @@ If you use LeanExplore in your research or work, please cite it as follows:
261
263
 
262
264
  **General Citation:**
263
265
 
264
- Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. Retrieved from [http://www.leanexplore.com](http://www.leanexplore.com) (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
266
+ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
265
267
 
266
268
  **BibTeX Entry:**
267
269
 
@@ -270,7 +272,6 @@ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. Le
270
272
  author = {Asher, Justin},
271
273
  title = {{LeanExplore: A search engine for Lean 4 declarations}},
272
274
  year = {2025},
273
- publisher = {LeanExplore.com},
274
275
  url = {http://www.leanexplore.com},
275
276
  note = {GitHub repository: https://github.com/justincasher/lean-explore}
276
277
  }
@@ -20,7 +20,7 @@ If you use LeanExplore in your research or work, please cite it as follows:
20
20
 
21
21
  **General Citation:**
22
22
 
23
- Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. Retrieved from [http://www.leanexplore.com](http://www.leanexplore.com) (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
23
+ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. LeanExplore.com. (GitHub: [https://github.com/justincasher/lean-explore](https://github.com/justincasher/lean-explore)).
24
24
 
25
25
  **BibTeX Entry:**
26
26
 
@@ -29,8 +29,7 @@ Justin Asher. (2025). *LeanExplore: A search engine for Lean 4 declarations*. Le
29
29
  author = {Asher, Justin},
30
30
  title = {{LeanExplore: A search engine for Lean 4 declarations}},
31
31
  year = {2025},
32
- publisher = {LeanExplore.com},
33
32
  url = {http://www.leanexplore.com},
34
33
  note = {GitHub repository: https://github.com/justincasher/lean-explore}
35
34
  }
36
- ```
35
+ ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "lean-explore"
7
- version = "0.1.4"
7
+ version = "0.2.0"
8
8
  authors = [
9
9
  { name = "Justin Asher", email = "justinchadwickasher@gmail.com" },
10
10
  ]
@@ -35,8 +35,9 @@ dependencies = [
35
35
  "numpy>=1.20",
36
36
  "faiss-cpu>=1.7",
37
37
  "sentence-transformers>=2.2.0",
38
- "rapidfuzz>=3.0.0",
39
38
  "filelock>=3.0.0",
39
+ "nltk>=3.6",
40
+ "rank-bm25>=0.2.2",
40
41
 
41
42
  # API Client / Shared Data Models
42
43
  "httpx>=0.23.0",
@@ -52,6 +53,7 @@ dependencies = [
52
53
 
53
54
  # Utilities
54
55
  "tqdm>=4.60",
56
+ "requests>=2.25.0",
55
57
  ]
56
58
 
57
59
  [project.urls]
@@ -5,6 +5,7 @@
5
5
  This module includes functions to fetch toolchain data (database, FAISS index, etc.)
6
6
  from a remote source (Cloudflare R2), verify its integrity, decompress it,
7
7
  and place it in the appropriate local directory for the application to use.
8
+ It also provides a command to clean up this downloaded data.
8
9
  """
9
10
 
10
11
  import gzip
@@ -32,7 +33,7 @@ from lean_explore import defaults # For R2 URLs and local paths
32
33
  app = typer.Typer(
33
34
  name="data",
34
35
  help="Manage local data toolchains for Lean Explore (e.g., download, list, "
35
- "select).",
36
+ "select, clean).",
36
37
  no_args_is_help=True,
37
38
  )
38
39
 
@@ -167,10 +168,6 @@ def _download_file_with_progress(
167
168
  "reported size for progress bar if available, otherwise "
168
169
  "expected size.[/yellow]"
169
170
  )
170
- # Prefer expected_size_bytes if it's provided and server doesn't send
171
- # Content-Length or if we want to strictly adhere to manifest size for
172
- # progress. However, for live progress, server's content-length is
173
- # usually more accurate for what's being transferred.
174
171
  if (
175
172
  total_size_from_header == 0
176
173
  ): # If server didn't provide content-length
@@ -201,13 +198,11 @@ def _download_file_with_progress(
201
198
  finally:
202
199
  r.close()
203
200
 
204
- # Sanity check after download
205
201
  actual_downloaded_size = destination_path.stat().st_size
206
202
  if (
207
203
  total_size_from_header > 0
208
204
  and actual_downloaded_size != total_size_from_header
209
205
  ):
210
- # This might indicate an incomplete download if not all bytes were written.
211
206
  console.print(
212
207
  f"[orange3]Warning: For [cyan]{description}[/cyan], downloaded size "
213
208
  f"({actual_downloaded_size} bytes) differs from Content-Length header "
@@ -258,7 +253,6 @@ def _verify_sha256_checksum(file_path: pathlib.Path, expected_checksum: str) ->
258
253
  sha256_hash = hashlib.sha256()
259
254
  try:
260
255
  with open(file_path, "rb") as f:
261
- # Read and update hash string value in blocks of 4K
262
256
  for byte_block in iter(lambda: f.read(4096), b""):
263
257
  sha256_hash.update(byte_block)
264
258
  calculated_checksum = sha256_hash.hexdigest()
@@ -328,28 +322,18 @@ def main() -> None:
328
322
 
329
323
 
330
324
  @app.command()
331
- def fetch(
332
- version: str = typer.Argument(
333
- None,
334
- help=(
335
- "The toolchain version to fetch (e.g., 'stable', '0.1.0'). "
336
- "'stable' will attempt to use the 'default_toolchain' from the manifest."
337
- ),
338
- show_default=False,
339
- ),
340
- ) -> None:
341
- """Fetches and installs a specified data version from the remote repository.
342
-
343
- Downloads necessary assets like the database and FAISS index, verifies their
344
- integrity via SHA256 checksums, decompresses them, and places them into the
345
- appropriate local directory (e.g., ~/.lean_explore/data/toolchains/<version>/).
325
+ def fetch() -> None:
326
+ """Fetches and installs the default data toolchain from the remote repository.
327
+
328
+ This command identifies the 'default_toolchain' (often aliased as 'stable')
329
+ from the remote manifest, then downloads necessary assets like the database
330
+ and FAISS index. It verifies their integrity via SHA256 checksums,
331
+ decompresses them, and places them into the appropriate local versioned
332
+ directory (e.g., ~/.lean_explore/data/toolchains/<default_version>/).
346
333
  """
347
- console.rule(
348
- f"[bold blue]Fetching Lean Explore Data Toolchain: {version}[/bold blue]"
349
- )
334
+ console.rule("[bold blue]Fetching Default Lean Explore Data Toolchain[/bold blue]")
350
335
 
351
- if version is None:
352
- version = "stable"
336
+ version_to_request = "stable" # Always fetch the stable/default version
353
337
 
354
338
  # 1. Fetch and Parse Manifest
355
339
  console.print(f"Fetching data manifest from {defaults.R2_MANIFEST_DEFAULT_URL}...")
@@ -362,12 +346,12 @@ def fetch(
362
346
  console.print("[green]Manifest fetched successfully.[/green]")
363
347
 
364
348
  # 2. Resolve Target Version from Manifest
365
- version_info = _resolve_toolchain_version_info(manifest_data, version)
349
+ version_info = _resolve_toolchain_version_info(manifest_data, version_to_request)
366
350
  if not version_info:
367
351
  # _resolve_toolchain_version_info already prints detailed errors
368
352
  raise typer.Exit(code=1)
369
353
 
370
- resolved_version_key = version_info["_resolved_key"] # Key like "0.1.0"
354
+ resolved_version_key = version_info["_resolved_key"] # Key like "0.1.0" or "0.2.0"
371
355
  console.print(
372
356
  f"Processing toolchain version: [bold yellow]{resolved_version_key}"
373
357
  "[/bold yellow] "
@@ -400,12 +384,8 @@ def fetch(
400
384
  local_name = file_entry.get("local_name")
401
385
  remote_name = file_entry.get("remote_name")
402
386
  expected_checksum = file_entry.get("sha256")
403
- expected_size_compressed = file_entry.get(
404
- "size_bytes_compressed"
405
- ) # This is size of .gz
406
- assets_r2_path_prefix = version_info.get(
407
- "assets_base_path_r2", ""
408
- ) # e.g., "assets/0.1.0/"
387
+ expected_size_compressed = file_entry.get("size_bytes_compressed")
388
+ assets_r2_path_prefix = version_info.get("assets_base_path_r2", "")
409
389
 
410
390
  if not all([local_name, remote_name, expected_checksum]):
411
391
  console.print(
@@ -418,7 +398,7 @@ def fetch(
418
398
  console.rule(f"[bold cyan]Processing: {local_name}[/bold cyan]")
419
399
 
420
400
  final_local_path = local_version_dir / local_name
421
- temp_download_path = local_version_dir / remote_name # Path for the .gz file
401
+ temp_download_path = local_version_dir / remote_name
422
402
 
423
403
  remote_url = (
424
404
  defaults.R2_ASSETS_BASE_URL.rstrip("/")
@@ -473,9 +453,7 @@ def fetch(
473
453
  )
474
454
  if final_local_path.exists():
475
455
  final_local_path.unlink(missing_ok=True)
476
- if (
477
- temp_download_path.exists()
478
- ): # Ensure .gz is also removed on decompress failure
456
+ if temp_download_path.exists():
479
457
  temp_download_path.unlink(missing_ok=True)
480
458
  continue
481
459
 
@@ -500,7 +478,87 @@ def fetch(
500
478
  raise typer.Exit(code=1)
501
479
 
502
480
 
481
+ @app.command("clean")
482
+ def clean_data_toolchains() -> None:
483
+ """Removes all downloaded local data toolchains.
484
+
485
+ This command deletes all version-specific subdirectories and their contents
486
+ within the local toolchains storage directory (typically located at
487
+ ~/.lean_explore/data/toolchains/).
488
+
489
+ Configuration files will not be affected.
490
+ """
491
+ toolchains_dir = defaults.LEAN_EXPLORE_TOOLCHAINS_BASE_DIR
492
+ console.print(
493
+ f"Attempting to clean local data toolchains from: [dim]{toolchains_dir}[/dim]"
494
+ )
495
+
496
+ if not toolchains_dir.exists() or not any(toolchains_dir.iterdir()):
497
+ console.print("[yellow]No local toolchain data found to clean.[/yellow]")
498
+ raise typer.Exit(code=0)
499
+
500
+ console.print(
501
+ "[bold yellow]\nThis will delete all downloaded database files and other "
502
+ "toolchain assets stored locally.[/bold yellow]"
503
+ )
504
+ if not typer.confirm(
505
+ "Are you sure you want to proceed?",
506
+ default=False,
507
+ abort=True, # Typer will exit if user chooses 'no' (the default)
508
+ ):
509
+ # This line is effectively not reached if user aborts.
510
+ # Kept for logical structure understanding, but Typer handles the abort.
511
+ return
512
+
513
+ console.print(f"\nCleaning data from {toolchains_dir}...")
514
+ deleted_items_count = 0
515
+ errors_encountered = False
516
+ try:
517
+ for item_path in toolchains_dir.iterdir():
518
+ try:
519
+ if item_path.is_dir():
520
+ shutil.rmtree(item_path)
521
+ console.print(f" Removed directory: [dim]{item_path.name}[/dim]")
522
+ deleted_items_count += 1
523
+ elif item_path.is_file(): # Handle stray files if any
524
+ item_path.unlink()
525
+ console.print(f" Removed file: [dim]{item_path.name}[/dim]")
526
+ deleted_items_count += 1
527
+ except OSError as e:
528
+ console.print(
529
+ f"[bold red] Error removing {item_path.name}: {e}[/bold red]"
530
+ )
531
+ errors_encountered = True
532
+
533
+ console.print("") # Add a newline for better formatting after item list
534
+
535
+ if errors_encountered:
536
+ console.print(
537
+ "[bold orange3]Data cleaning process completed with some errors. "
538
+ "Please review messages above.[/bold orange3]"
539
+ )
540
+ raise typer.Exit(code=1)
541
+ elif deleted_items_count > 0:
542
+ console.print(
543
+ "[bold green]All local toolchain data has been successfully "
544
+ "cleaned.[/bold green]"
545
+ )
546
+ else:
547
+ # This case might occur if the directory contained no items
548
+ # that were directories or files, or if it became empty
549
+ # between the initial check and this point.
550
+ console.print(
551
+ "[yellow]No items were deleted. The toolchain directory might "
552
+ "have been empty or contained unexpected item types.[/yellow]"
553
+ )
554
+
555
+ except OSError as e: # Error iterating the directory itself
556
+ console.print(
557
+ f"[bold red]An error occurred while accessing toolchain directory "
558
+ f"for cleaning: {e}[/bold red]"
559
+ )
560
+ raise typer.Exit(code=1)
561
+
562
+
503
563
  if __name__ == "__main__":
504
- # This allows testing `python -m lean_explore.cli.data_commands fetch stable`
505
- # For actual CLI use, this app will be mounted in `main.py`.
506
564
  app()
@@ -31,7 +31,7 @@ LEAN_EXPLORE_TOOLCHAINS_BASE_DIR: Final[pathlib.Path] = (
31
31
  # In future enhancements, this could be determined dynamically
32
32
  # or from user configuration.
33
33
  # For now, it's set to the initial version of data provided ("0.1.0").
34
- DEFAULT_ACTIVE_TOOLCHAIN_VERSION: Final[str] = "0.1.0"
34
+ DEFAULT_ACTIVE_TOOLCHAIN_VERSION: Final[str] = "0.2.0"
35
35
 
36
36
  # Path to the data directory for the currently active toolchain version.
37
37
  # Example: ~/.lean_explore/data/toolchains/0.1.0/
@@ -98,20 +98,17 @@ DEFAULT_EMBEDDING_MODEL_NAME: Final[str] = "BAAI/bge-base-en-v1.5"
98
98
  # FAISS Search Parameters
99
99
  DEFAULT_FAISS_K: Final[int] = 100 # Number of nearest neighbors from FAISS
100
100
  DEFAULT_FAISS_NPROBE: Final[int] = 200 # For IVF-type FAISS indexes
101
+ DEFAULT_FAISS_OVERSAMPLING_FACTOR: Final[int] = (
102
+ 3 # Factor to multiply faiss_k by when package filters are active.
103
+ )
101
104
 
102
105
  # Scoring and Ranking Parameters
103
106
  DEFAULT_SEM_SIM_THRESHOLD: Final[float] = 0.525
104
- DEFAULT_PAGERANK_WEIGHT: Final[float] = 1.0
105
- DEFAULT_TEXT_RELEVANCE_WEIGHT: Final[float] = 0.2
106
- DEFAULT_NAME_MATCH_WEIGHT: Final[float] = 0.5
107
+ DEFAULT_PAGERANK_WEIGHT: Final[float] = 0.2
108
+ DEFAULT_TEXT_RELEVANCE_WEIGHT: Final[float] = 1.0
109
+ DEFAULT_NAME_MATCH_WEIGHT: Final[float] = 1.0 # Ensuring float for consistency
107
110
 
108
111
  # Output Parameters
109
112
  DEFAULT_RESULTS_LIMIT: Final[int] = (
110
113
  50 # Default number of final results to display/return
111
114
  )
112
-
113
-
114
- # --- Other Constants (if any emerge) ---
115
- # Example: If your application needs other hardcoded default values,
116
- # they can be added here.
117
- # DEFAULT_SOME_OTHER_PARAMETER: Final[Any] = "some_value"