deepresearch-flow 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/db.py +81 -0
- deepresearch_flow/paper/snapshot/api.py +25 -15
- deepresearch_flow/paper/snapshot/common.py +34 -0
- deepresearch_flow/paper/snapshot/mcp_server.py +686 -0
- deepresearch_flow/paper/snapshot/unpacker.py +259 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.0.dist-info}/METADATA +3 -2
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.0.dist-info}/RECORD +11 -8
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/db.py
CHANGED
|
@@ -424,6 +424,87 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
424
424
|
click.echo(f"Wrote snapshot DB: {opts.output_db}")
|
|
425
425
|
click.echo(f"Wrote static export: {opts.static_export_dir}")
|
|
426
426
|
|
|
427
|
+
@snapshot_group.group("unpack")
|
|
428
|
+
def snapshot_unpack_group() -> None:
|
|
429
|
+
"""Unpack snapshot artifacts."""
|
|
430
|
+
|
|
431
|
+
@snapshot_unpack_group.command("md")
|
|
432
|
+
@click.option("--snapshot-db", "snapshot_db", required=True, help="Path to snapshot database")
|
|
433
|
+
@click.option(
|
|
434
|
+
"--static-export-dir",
|
|
435
|
+
"static_export_dir",
|
|
436
|
+
required=True,
|
|
437
|
+
help="Path to static export directory",
|
|
438
|
+
)
|
|
439
|
+
@click.option(
|
|
440
|
+
"--pdf-root",
|
|
441
|
+
"pdf_roots",
|
|
442
|
+
multiple=True,
|
|
443
|
+
required=True,
|
|
444
|
+
help="PDF root directories for name alignment (repeatable)",
|
|
445
|
+
)
|
|
446
|
+
@click.option("--md-output-dir", "md_output_dir", required=True, help="Output directory for Markdown")
|
|
447
|
+
@click.option(
|
|
448
|
+
"--md-translated-output-dir",
|
|
449
|
+
"md_translated_output_dir",
|
|
450
|
+
required=True,
|
|
451
|
+
help="Output directory for translated Markdown",
|
|
452
|
+
)
|
|
453
|
+
def snapshot_unpack_md(
|
|
454
|
+
snapshot_db: str,
|
|
455
|
+
static_export_dir: str,
|
|
456
|
+
pdf_roots: tuple[str, ...],
|
|
457
|
+
md_output_dir: str,
|
|
458
|
+
md_translated_output_dir: str,
|
|
459
|
+
) -> None:
|
|
460
|
+
"""Unpack source/translated markdown and align filenames to PDFs."""
|
|
461
|
+
from deepresearch_flow.paper.snapshot.unpacker import SnapshotUnpackMdOptions, unpack_md
|
|
462
|
+
|
|
463
|
+
opts = SnapshotUnpackMdOptions(
|
|
464
|
+
snapshot_db=Path(snapshot_db),
|
|
465
|
+
static_export_dir=Path(static_export_dir),
|
|
466
|
+
pdf_roots=[Path(path) for path in pdf_roots],
|
|
467
|
+
md_output_dir=Path(md_output_dir),
|
|
468
|
+
md_translated_output_dir=Path(md_translated_output_dir),
|
|
469
|
+
)
|
|
470
|
+
unpack_md(opts)
|
|
471
|
+
|
|
472
|
+
@snapshot_unpack_group.command("info")
|
|
473
|
+
@click.option("--snapshot-db", "snapshot_db", required=True, help="Path to snapshot database")
|
|
474
|
+
@click.option(
|
|
475
|
+
"--static-export-dir",
|
|
476
|
+
"static_export_dir",
|
|
477
|
+
required=True,
|
|
478
|
+
help="Path to static export directory",
|
|
479
|
+
)
|
|
480
|
+
@click.option(
|
|
481
|
+
"--pdf-root",
|
|
482
|
+
"pdf_roots",
|
|
483
|
+
multiple=True,
|
|
484
|
+
required=True,
|
|
485
|
+
help="PDF root directories for name alignment (repeatable)",
|
|
486
|
+
)
|
|
487
|
+
@click.option("--template", "template", required=True, help="Summary template tag")
|
|
488
|
+
@click.option("--output-json", "output_json", required=True, help="Output JSON file path")
|
|
489
|
+
def snapshot_unpack_info(
|
|
490
|
+
snapshot_db: str,
|
|
491
|
+
static_export_dir: str,
|
|
492
|
+
pdf_roots: tuple[str, ...],
|
|
493
|
+
template: str,
|
|
494
|
+
output_json: str,
|
|
495
|
+
) -> None:
|
|
496
|
+
"""Unpack aggregated paper_infos.json from snapshot summaries."""
|
|
497
|
+
from deepresearch_flow.paper.snapshot.unpacker import SnapshotUnpackInfoOptions, unpack_info
|
|
498
|
+
|
|
499
|
+
opts = SnapshotUnpackInfoOptions(
|
|
500
|
+
snapshot_db=Path(snapshot_db),
|
|
501
|
+
static_export_dir=Path(static_export_dir),
|
|
502
|
+
pdf_roots=[Path(path) for path in pdf_roots],
|
|
503
|
+
template=template,
|
|
504
|
+
output_json=Path(output_json),
|
|
505
|
+
)
|
|
506
|
+
unpack_info(opts)
|
|
507
|
+
|
|
427
508
|
@db_group.group("api")
|
|
428
509
|
def api_group() -> None:
|
|
429
510
|
"""Read-only JSON API server backed by a snapshot DB."""
|
|
@@ -11,8 +11,9 @@ from starlette.applications import Starlette
|
|
|
11
11
|
from starlette.middleware.cors import CORSMiddleware
|
|
12
12
|
from starlette.requests import Request
|
|
13
13
|
from starlette.responses import JSONResponse, Response
|
|
14
|
-
from starlette.routing import Route
|
|
14
|
+
from starlette.routing import Mount, Route
|
|
15
15
|
|
|
16
|
+
from deepresearch_flow.paper.snapshot.common import ApiLimits, _open_ro_conn
|
|
16
17
|
from deepresearch_flow.paper.snapshot.text import merge_adjacent_markers, remove_cjk_spaces, rewrite_search_query
|
|
17
18
|
|
|
18
19
|
_WHITESPACE_RE = re.compile(r"\s+")
|
|
@@ -87,13 +88,6 @@ _FACET_TYPE_TO_KEY = {
|
|
|
87
88
|
}
|
|
88
89
|
|
|
89
90
|
|
|
90
|
-
@dataclass(frozen=True)
|
|
91
|
-
class ApiLimits:
|
|
92
|
-
max_query_length: int = 500
|
|
93
|
-
max_page_size: int = 100
|
|
94
|
-
max_pagination_offset: int = 10_000 # page * page_size
|
|
95
|
-
|
|
96
|
-
|
|
97
91
|
@dataclass(frozen=True)
|
|
98
92
|
class SnapshotApiConfig:
|
|
99
93
|
snapshot_db: Path
|
|
@@ -110,12 +104,6 @@ def _json_error(status_code: int, *, error: str, detail: str) -> JSONResponse:
|
|
|
110
104
|
return JSONResponse({"error": error, "detail": detail}, status_code=status_code)
|
|
111
105
|
|
|
112
106
|
|
|
113
|
-
def _open_ro_conn(db_path: Path) -> sqlite3.Connection:
|
|
114
|
-
uri = f"file:{db_path.as_posix()}?mode=ro"
|
|
115
|
-
conn = sqlite3.connect(uri, uri=True)
|
|
116
|
-
conn.row_factory = sqlite3.Row
|
|
117
|
-
conn.execute("PRAGMA query_only=ON;")
|
|
118
|
-
return conn
|
|
119
107
|
|
|
120
108
|
|
|
121
109
|
def _snapshot_build_id(conn: sqlite3.Connection) -> str:
|
|
@@ -917,6 +905,22 @@ def create_app(
|
|
|
917
905
|
limits=limits or ApiLimits(),
|
|
918
906
|
)
|
|
919
907
|
|
|
908
|
+
# Lazy import to avoid circular dependency
|
|
909
|
+
from deepresearch_flow.paper.snapshot.mcp_server import (
|
|
910
|
+
McpSnapshotConfig,
|
|
911
|
+
create_mcp_app,
|
|
912
|
+
resolve_static_export_dir,
|
|
913
|
+
)
|
|
914
|
+
|
|
915
|
+
mcp_config = McpSnapshotConfig(
|
|
916
|
+
snapshot_db=snapshot_db,
|
|
917
|
+
static_base_url=_normalize_base_url(static_base_url),
|
|
918
|
+
static_export_dir=resolve_static_export_dir(),
|
|
919
|
+
limits=limits or ApiLimits(),
|
|
920
|
+
origin_allowlist=cors_allowed_origins or ["*"],
|
|
921
|
+
)
|
|
922
|
+
mcp_app, mcp_lifespan = create_mcp_app(mcp_config)
|
|
923
|
+
|
|
920
924
|
routes = [
|
|
921
925
|
Route("/api/v1/config", _api_config, methods=["GET"]),
|
|
922
926
|
Route("/api/v1/search", _api_search, methods=["GET"]),
|
|
@@ -927,9 +931,15 @@ def create_app(
|
|
|
927
931
|
Route("/api/v1/facets/{facet:str}/{facet_id:str}/stats", _api_facet_stats, methods=["GET"]),
|
|
928
932
|
Route("/api/v1/facets/{facet:str}/by-value/{value:str}/papers", _api_facet_by_value_papers, methods=["GET"]),
|
|
929
933
|
Route("/api/v1/facets/{facet:str}/by-value/{value:str}/stats", _api_facet_by_value_stats, methods=["GET"]),
|
|
934
|
+
Mount("/mcp", app=mcp_app),
|
|
930
935
|
]
|
|
931
936
|
|
|
932
|
-
|
|
937
|
+
# Pass MCP lifespan to ensure session manager initializes properly
|
|
938
|
+
# https://gofastmcp.com/deployment/http#mounting-in-starlette
|
|
939
|
+
app = Starlette(
|
|
940
|
+
routes=routes,
|
|
941
|
+
lifespan=mcp_lifespan,
|
|
942
|
+
)
|
|
933
943
|
if cfg.cors_allowed_origins:
|
|
934
944
|
app.add_middleware(
|
|
935
945
|
CORSMiddleware,
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Shared utilities for snapshot API and MCP server.
|
|
2
|
+
|
|
3
|
+
This module contains common types, configuration, and utilities used by both
|
|
4
|
+
the snapshot REST API and the MCP server to avoid circular imports.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import sqlite3
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
|
|
15
|
+
class ApiLimits:
|
|
16
|
+
"""API rate and size limits."""
|
|
17
|
+
|
|
18
|
+
max_query_length: int = 500
|
|
19
|
+
max_page_size: int = 100
|
|
20
|
+
max_pagination_offset: int = 10_000
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _open_ro_conn(db_path: Path) -> sqlite3.Connection:
|
|
24
|
+
"""Open a read-only SQLite connection with Row factory.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
db_path: Path to the SQLite database file.
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
sqlite3.Connection: A read-only connection with row_factory set to Row.
|
|
31
|
+
"""
|
|
32
|
+
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, check_same_thread=False)
|
|
33
|
+
conn.row_factory = sqlite3.Row
|
|
34
|
+
return conn
|
|
@@ -0,0 +1,686 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import re
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
from starlette.applications import Starlette
|
|
12
|
+
from starlette.middleware import Middleware
|
|
13
|
+
from starlette.middleware.base import BaseHTTPMiddleware
|
|
14
|
+
from starlette.requests import Request
|
|
15
|
+
from starlette.responses import Response
|
|
16
|
+
from starlette.routing import Mount
|
|
17
|
+
|
|
18
|
+
from fastmcp import FastMCP
|
|
19
|
+
|
|
20
|
+
from deepresearch_flow.paper.snapshot.common import ApiLimits, _open_ro_conn
|
|
21
|
+
from deepresearch_flow.paper.snapshot.text import merge_adjacent_markers, remove_cjk_spaces, rewrite_search_query
|
|
22
|
+
|
|
23
|
+
_SUPPORTED_PROTOCOL_VERSIONS = {"2025-03-26", "2025-06-18"}
|
|
24
|
+
_DEFAULT_MAX_CHARS = 50_000
|
|
25
|
+
_DEFAULT_TIMEOUT = 10.0
|
|
26
|
+
_PAPER_ID_PATTERN = re.compile(r'^[a-zA-Z0-9_-]+$')
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class McpToolError(Exception):
|
|
30
|
+
"""MCP tool exception for standardized error handling.
|
|
31
|
+
|
|
32
|
+
FastMCP will catch this exception and convert it to a proper
|
|
33
|
+
JSON-RPC error response that the client can understand.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(self, code: str, message: str, **details):
|
|
37
|
+
self.code = code
|
|
38
|
+
self.message = message
|
|
39
|
+
self.details = details
|
|
40
|
+
super().__init__(message)
|
|
41
|
+
|
|
42
|
+
def to_dict(self) -> dict[str, Any]:
|
|
43
|
+
"""Convert to error dictionary format."""
|
|
44
|
+
return {"error": self.code, "message": self.message, **self.details}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
|
|
48
|
+
class McpSnapshotConfig:
|
|
49
|
+
snapshot_db: Path
|
|
50
|
+
static_base_url: str
|
|
51
|
+
static_export_dir: Path | None
|
|
52
|
+
limits: ApiLimits
|
|
53
|
+
origin_allowlist: list[str]
|
|
54
|
+
max_chars_default: int = _DEFAULT_MAX_CHARS
|
|
55
|
+
http_timeout: float = _DEFAULT_TIMEOUT
|
|
56
|
+
max_paper_id_length: int = 64
|
|
57
|
+
# HTTP client stored in object __dict__ to avoid dataclass frozen restriction
|
|
58
|
+
_http_client: httpx.Client | None = field(default=None, repr=False, compare=False)
|
|
59
|
+
|
|
60
|
+
def get_http_client(self) -> httpx.Client:
|
|
61
|
+
"""Get or create a shared HTTP client with connection pooling."""
|
|
62
|
+
if self._http_client is None:
|
|
63
|
+
object.__setattr__(
|
|
64
|
+
self,
|
|
65
|
+
'_http_client',
|
|
66
|
+
httpx.Client(
|
|
67
|
+
timeout=self.http_timeout,
|
|
68
|
+
follow_redirects=True,
|
|
69
|
+
limits=httpx.Limits(
|
|
70
|
+
max_keepalive_connections=10,
|
|
71
|
+
max_connections=20
|
|
72
|
+
)
|
|
73
|
+
)
|
|
74
|
+
)
|
|
75
|
+
return self._http_client
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class McpRequestGuardMiddleware(BaseHTTPMiddleware):
|
|
79
|
+
def __init__(self, app, *, origin_allowlist: list[str]) -> None:
|
|
80
|
+
super().__init__(app)
|
|
81
|
+
self._allowlist = [origin.lower() for origin in origin_allowlist]
|
|
82
|
+
|
|
83
|
+
async def dispatch(self, request: Request, call_next): # type: ignore[override]
|
|
84
|
+
if request.method == "GET":
|
|
85
|
+
return Response("Method Not Allowed", status_code=405)
|
|
86
|
+
if request.method not in {"POST", "OPTIONS"}:
|
|
87
|
+
return Response("Method Not Allowed", status_code=405)
|
|
88
|
+
origin = request.headers.get("origin")
|
|
89
|
+
if origin and not self._is_allowed_origin(origin):
|
|
90
|
+
return Response("Forbidden", status_code=403)
|
|
91
|
+
protocol = request.headers.get("mcp-protocol-version")
|
|
92
|
+
if protocol and protocol not in _SUPPORTED_PROTOCOL_VERSIONS:
|
|
93
|
+
return Response("Bad Request", status_code=400)
|
|
94
|
+
return await call_next(request)
|
|
95
|
+
|
|
96
|
+
def _is_allowed_origin(self, origin: str) -> bool:
|
|
97
|
+
if not self._allowlist or "*" in self._allowlist:
|
|
98
|
+
return True
|
|
99
|
+
return origin.lower() in self._allowlist
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
_CONFIG: McpSnapshotConfig | None = None
|
|
103
|
+
mcp = FastMCP("Paper DB MCP")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def configure(config: McpSnapshotConfig) -> None:
|
|
107
|
+
global _CONFIG
|
|
108
|
+
_CONFIG = config
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def create_mcp_app(config: McpSnapshotConfig) -> tuple[Starlette, Any]:
|
|
112
|
+
"""Create MCP app with middleware and return it along with lifespan.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
Tuple of (wrapped_app, lifespan_context) for use by parent Starlette.
|
|
116
|
+
"""
|
|
117
|
+
configure(config)
|
|
118
|
+
mcp_app = mcp.http_app(path="/", stateless_http=True)
|
|
119
|
+
wrapped = Starlette(
|
|
120
|
+
routes=[Mount("/", app=mcp_app)],
|
|
121
|
+
middleware=[
|
|
122
|
+
Middleware(McpRequestGuardMiddleware, origin_allowlist=config.origin_allowlist),
|
|
123
|
+
],
|
|
124
|
+
)
|
|
125
|
+
return wrapped, mcp_app.lifespan
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_config() -> McpSnapshotConfig:
|
|
129
|
+
if _CONFIG is None:
|
|
130
|
+
raise RuntimeError("MCP server not configured")
|
|
131
|
+
return _CONFIG
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _validate_query(query: str, cfg: McpSnapshotConfig) -> str:
|
|
135
|
+
"""Validate search query string.
|
|
136
|
+
|
|
137
|
+
Raises:
|
|
138
|
+
McpToolError: If query is invalid or too long.
|
|
139
|
+
"""
|
|
140
|
+
if not query or not query.strip():
|
|
141
|
+
raise McpToolError("invalid_query", "Query cannot be empty")
|
|
142
|
+
if len(query) > cfg.limits.max_query_length:
|
|
143
|
+
raise McpToolError(
|
|
144
|
+
"query_too_long",
|
|
145
|
+
f"Query exceeds maximum length of {cfg.limits.max_query_length}",
|
|
146
|
+
length=len(query),
|
|
147
|
+
max_length=cfg.limits.max_query_length
|
|
148
|
+
)
|
|
149
|
+
return query.strip()
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _validate_paper_id(paper_id: str, cfg: McpSnapshotConfig) -> str:
|
|
153
|
+
"""Validate paper ID format.
|
|
154
|
+
|
|
155
|
+
Raises:
|
|
156
|
+
McpToolError: If paper_id is invalid.
|
|
157
|
+
"""
|
|
158
|
+
if not paper_id:
|
|
159
|
+
raise McpToolError("invalid_paper_id", "Paper ID cannot be empty")
|
|
160
|
+
if len(paper_id) > cfg.max_paper_id_length:
|
|
161
|
+
raise McpToolError(
|
|
162
|
+
"paper_id_too_long",
|
|
163
|
+
f"Paper ID exceeds maximum length of {cfg.max_paper_id_length}",
|
|
164
|
+
length=len(paper_id),
|
|
165
|
+
max_length=cfg.max_paper_id_length
|
|
166
|
+
)
|
|
167
|
+
if not _PAPER_ID_PATTERN.match(paper_id):
|
|
168
|
+
raise McpToolError(
|
|
169
|
+
"invalid_paper_id_format",
|
|
170
|
+
"Paper ID must contain only alphanumeric characters, hyphens, and underscores",
|
|
171
|
+
paper_id=paper_id
|
|
172
|
+
)
|
|
173
|
+
return paper_id
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _truncate(text: str, max_chars: int | None) -> str:
|
|
177
|
+
"""Truncate text with marker."""
|
|
178
|
+
if max_chars is None or max_chars <= 0:
|
|
179
|
+
return text
|
|
180
|
+
if len(text) <= max_chars:
|
|
181
|
+
return text
|
|
182
|
+
remaining = len(text) - max_chars
|
|
183
|
+
return f"{text[:max_chars]}\n[truncated: {remaining} more chars]"
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _read_static_text(rel_path: str) -> str | None:
|
|
187
|
+
"""Read static text from local export directory if available."""
|
|
188
|
+
cfg = _get_config()
|
|
189
|
+
if cfg.static_export_dir:
|
|
190
|
+
path = cfg.static_export_dir / rel_path
|
|
191
|
+
if path.exists():
|
|
192
|
+
return path.read_text(encoding="utf-8")
|
|
193
|
+
return None
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _fetch_static_text(rel_path: str) -> str:
|
|
197
|
+
"""Fetch static text from HTTP remote."""
|
|
198
|
+
cfg = _get_config()
|
|
199
|
+
if cfg.static_base_url:
|
|
200
|
+
base = cfg.static_base_url.rstrip("/")
|
|
201
|
+
url = f"{base}/{rel_path.lstrip('/')}"
|
|
202
|
+
client = cfg.get_http_client()
|
|
203
|
+
response = client.get(url)
|
|
204
|
+
response.raise_for_status()
|
|
205
|
+
return response.text
|
|
206
|
+
raise FileNotFoundError("static_base_url not configured")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _load_static_text(rel_path: str) -> str:
|
|
210
|
+
"""Load static text with fallback: local first, then HTTP."""
|
|
211
|
+
try:
|
|
212
|
+
text = _read_static_text(rel_path)
|
|
213
|
+
if text is not None:
|
|
214
|
+
return text
|
|
215
|
+
return _fetch_static_text(rel_path)
|
|
216
|
+
except httpx.HTTPStatusError as exc:
|
|
217
|
+
raise RuntimeError(f"asset_fetch_failed:{exc.response.status_code}") from exc
|
|
218
|
+
except httpx.RequestError as exc:
|
|
219
|
+
raise RuntimeError("asset_fetch_failed:request_error") from exc
|
|
220
|
+
except FileNotFoundError as exc:
|
|
221
|
+
raise RuntimeError("asset_fetch_failed:not_configured") from exc
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _load_summary_json(paper_id: str, template: str | None) -> tuple[str | None, list[str] | None]:
|
|
225
|
+
"""Load summary JSON content and return available templates list."""
|
|
226
|
+
cfg = _get_config()
|
|
227
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
228
|
+
try:
|
|
229
|
+
row = conn.execute(
|
|
230
|
+
"SELECT preferred_summary_template, summary_asset_paths_json FROM paper WHERE paper_id = ?",
|
|
231
|
+
(paper_id,),
|
|
232
|
+
).fetchone()
|
|
233
|
+
if not row:
|
|
234
|
+
return None, None
|
|
235
|
+
preferred = row["preferred_summary_template"]
|
|
236
|
+
asset_paths = json.loads(row["summary_asset_paths_json"] or "{}")
|
|
237
|
+
available = sorted(asset_paths.keys())
|
|
238
|
+
selected = template if template else preferred
|
|
239
|
+
if not selected or selected not in asset_paths:
|
|
240
|
+
return None, available
|
|
241
|
+
rel_path = asset_paths[selected]
|
|
242
|
+
return _load_static_text(rel_path), available
|
|
243
|
+
finally:
|
|
244
|
+
conn.close()
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _load_source_markdown(paper_id: str) -> str | None:
|
|
248
|
+
"""Load source markdown for paper."""
|
|
249
|
+
cfg = _get_config()
|
|
250
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
251
|
+
try:
|
|
252
|
+
row = conn.execute(
|
|
253
|
+
"SELECT source_md_content_hash FROM paper WHERE paper_id = ?",
|
|
254
|
+
(paper_id,),
|
|
255
|
+
).fetchone()
|
|
256
|
+
if not row or not row["source_md_content_hash"]:
|
|
257
|
+
return None
|
|
258
|
+
rel_path = f"md/{row['source_md_content_hash']}.md"
|
|
259
|
+
return _load_static_text(rel_path)
|
|
260
|
+
finally:
|
|
261
|
+
conn.close()
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _load_translation_markdown(paper_id: str, lang: str) -> str | None:
|
|
265
|
+
"""Load translation markdown for paper and language."""
|
|
266
|
+
cfg = _get_config()
|
|
267
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
268
|
+
try:
|
|
269
|
+
row = conn.execute(
|
|
270
|
+
"SELECT translations_json FROM paper WHERE paper_id = ?",
|
|
271
|
+
(paper_id,),
|
|
272
|
+
).fetchone()
|
|
273
|
+
if not row or not row["translations_json"]:
|
|
274
|
+
return None
|
|
275
|
+
translations = json.loads(row["translations_json"])
|
|
276
|
+
rel_path = translations.get(lang)
|
|
277
|
+
if not rel_path:
|
|
278
|
+
return None
|
|
279
|
+
return _load_static_text(rel_path)
|
|
280
|
+
finally:
|
|
281
|
+
conn.close()
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# ==================== MCP Tools ====================
|
|
285
|
+
|
|
286
|
+
@mcp.tool()
|
|
287
|
+
def search_papers(query: str, limit: int = 10) -> list[dict[str, Any]]:
|
|
288
|
+
"""Full-text search for papers (relevance-ranked).
|
|
289
|
+
|
|
290
|
+
Use when you only have topic keywords.
|
|
291
|
+
Returns paper_id, title, year, venue, snippet_markdown.
|
|
292
|
+
"""
|
|
293
|
+
cfg = _get_config()
|
|
294
|
+
query = _validate_query(query, cfg)
|
|
295
|
+
limit = min(max(1, int(limit)), cfg.limits.max_page_size)
|
|
296
|
+
|
|
297
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
298
|
+
try:
|
|
299
|
+
cur = conn.execute(
|
|
300
|
+
"""
|
|
301
|
+
SELECT paper_id, title, year, venue, abstract
|
|
302
|
+
FROM paper_search
|
|
303
|
+
WHERE paper_search MATCH ?
|
|
304
|
+
ORDER BY rank
|
|
305
|
+
LIMIT ?
|
|
306
|
+
""",
|
|
307
|
+
(rewrite_search_query(query), limit),
|
|
308
|
+
)
|
|
309
|
+
rows = cur.fetchall()
|
|
310
|
+
results: list[dict[str, Any]] = []
|
|
311
|
+
for row in rows:
|
|
312
|
+
snippet = str(row["abstract"] or "")
|
|
313
|
+
snippet = remove_cjk_spaces(snippet)
|
|
314
|
+
snippet, markers = merge_adjacent_markers(snippet)
|
|
315
|
+
results.append({
|
|
316
|
+
"paper_id": str(row["paper_id"]),
|
|
317
|
+
"title": str(row["title"]),
|
|
318
|
+
"year": str(row["year"]),
|
|
319
|
+
"venue": str(row["venue"]),
|
|
320
|
+
"snippet_markdown": snippet,
|
|
321
|
+
})
|
|
322
|
+
return results
|
|
323
|
+
finally:
|
|
324
|
+
conn.close()
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
@mcp.tool()
|
|
328
|
+
def search_papers_by_keyword(keyword: str, limit: int = 10) -> list[dict[str, Any]]:
|
|
329
|
+
"""Search papers by keyword/tag (exact match).
|
|
330
|
+
|
|
331
|
+
Use when you know specific keywords or tags.
|
|
332
|
+
"""
|
|
333
|
+
cfg = _get_config()
|
|
334
|
+
limit = min(max(1, int(limit)), cfg.limits.max_page_size)
|
|
335
|
+
|
|
336
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
337
|
+
try:
|
|
338
|
+
rows = conn.execute(
|
|
339
|
+
"""
|
|
340
|
+
SELECT DISTINCT p.paper_id, p.title, p.year, p.venue, p.abstract
|
|
341
|
+
FROM paper p
|
|
342
|
+
JOIN paper_keyword pk ON pk.paper_id = p.paper_id
|
|
343
|
+
JOIN keyword k ON k.keyword_id = pk.keyword_id
|
|
344
|
+
WHERE k.value LIKE ?
|
|
345
|
+
ORDER BY p.year DESC, p.title ASC
|
|
346
|
+
LIMIT ?
|
|
347
|
+
""",
|
|
348
|
+
(f"%{keyword}%", limit),
|
|
349
|
+
).fetchall()
|
|
350
|
+
results: list[dict[str, Any]] = []
|
|
351
|
+
for row in rows:
|
|
352
|
+
snippet = str(row["abstract"] or "")
|
|
353
|
+
snippet = remove_cjk_spaces(snippet)
|
|
354
|
+
snippet, markers = merge_adjacent_markers(snippet)
|
|
355
|
+
results.append({
|
|
356
|
+
"paper_id": str(row["paper_id"]),
|
|
357
|
+
"title": str(row["title"]),
|
|
358
|
+
"year": str(row["year"]),
|
|
359
|
+
"venue": str(row["venue"]),
|
|
360
|
+
"snippet_markdown": snippet,
|
|
361
|
+
})
|
|
362
|
+
return results
|
|
363
|
+
finally:
|
|
364
|
+
conn.close()
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
@mcp.tool()
|
|
368
|
+
def get_paper_metadata(paper_id: str) -> dict[str, Any]:
|
|
369
|
+
"""Get paper metadata and available summary templates.
|
|
370
|
+
|
|
371
|
+
Call this first before requesting a summary to discover available templates.
|
|
372
|
+
"""
|
|
373
|
+
cfg = _get_config()
|
|
374
|
+
paper_id = _validate_paper_id(paper_id, cfg)
|
|
375
|
+
|
|
376
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
377
|
+
try:
|
|
378
|
+
row = conn.execute(
|
|
379
|
+
"""
|
|
380
|
+
SELECT paper_id, title, year, venue, doi, arxiv_id, openreview_id, paper_pw_url,
|
|
381
|
+
preferred_summary_template, summary_asset_paths_json
|
|
382
|
+
FROM paper WHERE paper_id = ?
|
|
383
|
+
""",
|
|
384
|
+
(paper_id,),
|
|
385
|
+
).fetchone()
|
|
386
|
+
if not row:
|
|
387
|
+
raise McpToolError("not_found", "paper not found", paper_id=paper_id)
|
|
388
|
+
|
|
389
|
+
asset_paths = json.loads(row["summary_asset_paths_json"] or "{}")
|
|
390
|
+
available = sorted(asset_paths.keys())
|
|
391
|
+
return {
|
|
392
|
+
"paper_id": str(row["paper_id"]),
|
|
393
|
+
"title": str(row["title"]),
|
|
394
|
+
"year": str(row["year"]),
|
|
395
|
+
"venue": str(row["venue"]),
|
|
396
|
+
"doi": row["doi"],
|
|
397
|
+
"arxiv_id": row["arxiv_id"],
|
|
398
|
+
"openreview_id": row["openreview_id"],
|
|
399
|
+
"paper_pw_url": row["paper_pw_url"],
|
|
400
|
+
"preferred_summary_template": row["preferred_summary_template"],
|
|
401
|
+
"available_summary_templates": available,
|
|
402
|
+
}
|
|
403
|
+
finally:
|
|
404
|
+
conn.close()
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
@mcp.tool()
|
|
408
|
+
def get_paper_summary(paper_id: str, template: str | None = None, max_chars: int | None = None) -> str:
|
|
409
|
+
"""Get summary JSON as raw string.
|
|
410
|
+
|
|
411
|
+
Uses preferred template if template is not specified.
|
|
412
|
+
Returns the full JSON content (not a URL).
|
|
413
|
+
"""
|
|
414
|
+
cfg = _get_config()
|
|
415
|
+
paper_id = _validate_paper_id(paper_id, cfg)
|
|
416
|
+
max_chars = max_chars if max_chars is not None else cfg.max_chars_default
|
|
417
|
+
|
|
418
|
+
try:
|
|
419
|
+
payload, available = _load_summary_json(paper_id, template)
|
|
420
|
+
except RuntimeError as exc:
|
|
421
|
+
raise McpToolError(
|
|
422
|
+
"asset_fetch_failed",
|
|
423
|
+
"Failed to fetch summary asset",
|
|
424
|
+
paper_id=paper_id,
|
|
425
|
+
template=template,
|
|
426
|
+
detail=str(exc),
|
|
427
|
+
) from exc
|
|
428
|
+
|
|
429
|
+
if payload is None:
|
|
430
|
+
raise McpToolError(
|
|
431
|
+
"template_not_available",
|
|
432
|
+
"Template not available",
|
|
433
|
+
paper_id=paper_id,
|
|
434
|
+
template=template,
|
|
435
|
+
available_summary_templates=available,
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
return _truncate(payload, max_chars)
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
@mcp.tool()
|
|
442
|
+
def get_paper_source(paper_id: str, max_chars: int | None = None) -> str:
|
|
443
|
+
"""Get source markdown text.
|
|
444
|
+
|
|
445
|
+
Content may be large; use max_chars to limit size.
|
|
446
|
+
"""
|
|
447
|
+
cfg = _get_config()
|
|
448
|
+
paper_id = _validate_paper_id(paper_id, cfg)
|
|
449
|
+
max_chars = max_chars if max_chars is not None else cfg.max_chars_default
|
|
450
|
+
|
|
451
|
+
try:
|
|
452
|
+
content = _load_source_markdown(paper_id)
|
|
453
|
+
except RuntimeError as exc:
|
|
454
|
+
raise McpToolError(
|
|
455
|
+
"asset_fetch_failed",
|
|
456
|
+
"Failed to fetch source asset",
|
|
457
|
+
paper_id=paper_id,
|
|
458
|
+
detail=str(exc),
|
|
459
|
+
) from exc
|
|
460
|
+
|
|
461
|
+
if content is None:
|
|
462
|
+
raise McpToolError(
|
|
463
|
+
"source_not_available",
|
|
464
|
+
"Source markdown not available",
|
|
465
|
+
paper_id=paper_id
|
|
466
|
+
)
|
|
467
|
+
|
|
468
|
+
return _truncate(content, max_chars)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
@mcp.tool()
|
|
472
|
+
def get_database_stats() -> dict[str, Any]:
|
|
473
|
+
"""Get database statistics.
|
|
474
|
+
|
|
475
|
+
Returns totals, year/month distributions, and top facets
|
|
476
|
+
(authors, venues, keywords, institutions, tags).
|
|
477
|
+
"""
|
|
478
|
+
cfg = _get_config()
|
|
479
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
480
|
+
try:
|
|
481
|
+
total_row = conn.execute("SELECT COUNT(*) AS c FROM paper").fetchone()
|
|
482
|
+
total = int(total_row["c"]) if total_row else 0
|
|
483
|
+
|
|
484
|
+
def top(table: str, limit: int = 20) -> list[dict[str, Any]]:
|
|
485
|
+
rows = conn.execute(
|
|
486
|
+
f"SELECT value, paper_count FROM {table} ORDER BY paper_count DESC, value ASC LIMIT ?",
|
|
487
|
+
(limit,),
|
|
488
|
+
).fetchall()
|
|
489
|
+
return [{"value": str(r["value"]), "paper_count": int(r["paper_count"])} for r in rows]
|
|
490
|
+
|
|
491
|
+
years = conn.execute(
|
|
492
|
+
"""
|
|
493
|
+
SELECT year AS value, paper_count
|
|
494
|
+
FROM year_count
|
|
495
|
+
ORDER BY CASE WHEN year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END,
|
|
496
|
+
CAST(year AS INT) DESC, year ASC
|
|
497
|
+
LIMIT 50
|
|
498
|
+
""",
|
|
499
|
+
).fetchall()
|
|
500
|
+
months = conn.execute(
|
|
501
|
+
"""
|
|
502
|
+
SELECT month AS value, paper_count
|
|
503
|
+
FROM month_count
|
|
504
|
+
ORDER BY CASE WHEN month GLOB '[0-1][0-9]' THEN 0 ELSE 1 END,
|
|
505
|
+
CAST(month AS INT) ASC, month ASC
|
|
506
|
+
""",
|
|
507
|
+
).fetchall()
|
|
508
|
+
|
|
509
|
+
return {
|
|
510
|
+
"total": total,
|
|
511
|
+
"years": [{"value": str(r["value"]), "paper_count": int(r["paper_count"])} for r in years],
|
|
512
|
+
"months": [{"value": str(r["value"]), "paper_count": int(r["paper_count"])} for r in months],
|
|
513
|
+
"authors": top("author"),
|
|
514
|
+
"venues": top("venue"),
|
|
515
|
+
"institutions": top("institution"),
|
|
516
|
+
"keywords": top("keyword"),
|
|
517
|
+
"tags": top("tag"),
|
|
518
|
+
}
|
|
519
|
+
finally:
|
|
520
|
+
conn.close()
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
@mcp.tool()
|
|
524
|
+
def list_top_facets(category: str, limit: int = 20) -> list[dict[str, Any]]:
|
|
525
|
+
"""List top facet values.
|
|
526
|
+
|
|
527
|
+
Category: author | venue | keyword | institution | tag
|
|
528
|
+
"""
|
|
529
|
+
table_map = {
|
|
530
|
+
"author": "author",
|
|
531
|
+
"venue": "venue",
|
|
532
|
+
"keyword": "keyword",
|
|
533
|
+
"institution": "institution",
|
|
534
|
+
"tag": "tag",
|
|
535
|
+
}
|
|
536
|
+
table = table_map.get((category or "").strip().lower())
|
|
537
|
+
if not table:
|
|
538
|
+
raise McpToolError(
|
|
539
|
+
"invalid_category",
|
|
540
|
+
f"Invalid category: {category}. Must be one of: {', '.join(table_map.keys())}",
|
|
541
|
+
category=category
|
|
542
|
+
)
|
|
543
|
+
|
|
544
|
+
limit = max(1, int(limit))
|
|
545
|
+
cfg = _get_config()
|
|
546
|
+
conn = _open_ro_conn(cfg.snapshot_db)
|
|
547
|
+
try:
|
|
548
|
+
rows = conn.execute(
|
|
549
|
+
f"SELECT value, paper_count FROM {table} ORDER BY paper_count DESC, value ASC LIMIT ?",
|
|
550
|
+
(limit,),
|
|
551
|
+
).fetchall()
|
|
552
|
+
return [{"value": str(r["value"]), "paper_count": int(r["paper_count"])} for r in rows]
|
|
553
|
+
finally:
|
|
554
|
+
conn.close()
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
@mcp.tool()
def filter_papers(
    author: str | None = None,
    venue: str | None = None,
    year: str | None = None,
    keyword: str | None = None,
    tag: str | None = None,
    limit: int = 10,
) -> list[dict[str, Any]]:
    """Filter papers by structured fields.

    Use for precise filtering by author, venue, year, keyword, or tag.
    """
    cfg = _get_config()
    limit = min(max(1, int(limit)), cfg.limits.max_page_size)

    join_clauses: list[str] = []
    where_clauses: list[str] = []
    bind_values: list[Any] = []

    # Each facet joins its link table and matches with a substring LIKE.
    if author:
        join_clauses.append("JOIN paper_author pa ON pa.paper_id = p.paper_id")
        join_clauses.append("JOIN author a ON a.author_id = pa.author_id")
        where_clauses.append("a.value LIKE ?")
        bind_values.append(f"%{author}%")
    if keyword:
        join_clauses.append("JOIN paper_keyword pk ON pk.paper_id = p.paper_id")
        join_clauses.append("JOIN keyword k ON k.keyword_id = pk.keyword_id")
        where_clauses.append("k.value LIKE ?")
        bind_values.append(f"%{keyword}%")
    if tag:
        join_clauses.append("JOIN paper_tag pt ON pt.paper_id = p.paper_id")
        join_clauses.append("JOIN tag t ON t.tag_id = pt.tag_id")
        where_clauses.append("t.value LIKE ?")
        bind_values.append(f"%{tag}%")
    if venue:
        where_clauses.append("p.venue LIKE ?")
        bind_values.append(f"%{venue}%")
    if year:
        where_clauses.append("p.year = ?")
        bind_values.append(str(year))

    sql = "SELECT DISTINCT p.paper_id, p.title, p.year, p.venue FROM paper p"
    if join_clauses:
        sql += " " + " ".join(join_clauses)
    if where_clauses:
        sql += " WHERE " + " AND ".join(where_clauses)
    sql += " ORDER BY p.year DESC, p.title ASC LIMIT ?"
    bind_values.append(limit)

    conn = _open_ro_conn(cfg.snapshot_db)
    try:
        return [
            {
                "paper_id": str(r["paper_id"]),
                "title": str(r["title"]),
                "year": str(r["year"]),
                "venue": str(r["venue"]),
            }
            for r in conn.execute(sql, tuple(bind_values)).fetchall()
        ]
    finally:
        conn.close()
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
# ==================== MCP Resources ====================
|
|
624
|
+
|
|
625
|
+
@mcp.resource("paper://{paper_id}/metadata")
def resource_metadata(paper_id: str) -> str:
    """Resource: metadata as JSON string."""
    # Delegate to the tool implementation and serialize its dict payload.
    return json.dumps(get_paper_metadata(paper_id), ensure_ascii=False)
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
@mcp.resource("paper://{paper_id}/summary")
def resource_summary_default(paper_id: str) -> str:
    """Resource: preferred summary JSON string."""
    # get_paper_summary already returns a JSON-encoded string; pass it through.
    return get_paper_summary(paper_id)
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
@mcp.resource("paper://{paper_id}/summary/{template}")
def resource_summary_template(paper_id: str, template: str) -> str:
    """Resource: summary JSON string for a specific template."""
    # get_paper_summary already returns a JSON-encoded string; pass it through.
    return get_paper_summary(paper_id, template=template)
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
@mcp.resource("paper://{paper_id}/source")
def resource_source(paper_id: str) -> str:
    """Resource: source markdown text."""
    # Thin resource wrapper over the tool implementation.
    return get_paper_source(paper_id)
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
@mcp.resource("paper://{paper_id}/translation/{lang}")
def resource_translation(paper_id: str, lang: str) -> str:
    """Resource: translated markdown text."""
    cfg = _get_config()
    paper_id = _validate_paper_id(paper_id, cfg)

    # Asset loading failures surface as a structured tool error, chained to
    # the underlying RuntimeError for diagnostics.
    try:
        markdown = _load_translation_markdown(paper_id, lang.lower())
    except RuntimeError as exc:
        raise McpToolError(
            "asset_fetch_failed",
            "Failed to fetch translation asset",
            paper_id=paper_id,
            lang=lang,
            detail=str(exc),
        ) from exc

    # A None result means the paper has no translation for this language.
    if markdown is None:
        raise McpToolError(
            "translation_not_available",
            "Translation not available",
            paper_id=paper_id,
            lang=lang,
        )

    return _truncate(markdown, cfg.max_chars_default)
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def resolve_static_export_dir() -> Path | None:
|
|
682
|
+
"""Resolve static export directory from environment variable."""
|
|
683
|
+
value = os.getenv("PAPER_DB_STATIC_EXPORT_DIR")
|
|
684
|
+
if not value:
|
|
685
|
+
return None
|
|
686
|
+
return Path(value)
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""Unpack snapshot to recover original files with readable names.
|
|
2
|
+
|
|
3
|
+
This is the reverse operation of builder.build_snapshot().
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
import hashlib
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import re
|
|
13
|
+
import sqlite3
|
|
14
|
+
from typing import Any, Iterable
|
|
15
|
+
|
|
16
|
+
from rich.console import Console
|
|
17
|
+
from rich.table import Table
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class SnapshotUnpackBaseOptions:
    """Common inputs shared by the snapshot unpack commands."""

    # Path to the snapshot SQLite database (the `paper` table is read).
    snapshot_db: Path
    # Directory holding the static export (md/, md_translate/, summary/ trees).
    static_export_dir: Path
    # PDF files or directories searched (recursively) to recover original names.
    pdf_roots: list[Path]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class SnapshotUnpackMdOptions(SnapshotUnpackBaseOptions):
    """Options for unpacking source and translated markdown."""

    # Destination directory for recovered source markdown files.
    md_output_dir: Path
    # Destination directory for recovered translated markdown files.
    md_translated_output_dir: Path
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class SnapshotUnpackInfoOptions(SnapshotUnpackBaseOptions):
    """Options for unpacking an aggregated summary JSON file."""

    # Summary template name; selects summary/<paper_id>/<template>.json.
    template: str
    # Output path for the aggregated JSON array.
    output_json: Path
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class UnpackCounts:
    """Mutable per-run tally of unpack outcomes, rendered by _print_summary."""

    total: int = 0                  # papers seen in the snapshot
    succeeded: int = 0              # primary artifacts recovered
    failed: int = 0                 # primary artifacts missing/unreadable
    missing_pdf: int = 0            # papers with no matching PDF in pdf_roots
    translated_succeeded: int = 0   # translated markdown files recovered
    translated_failed: int = 0      # translated markdown files missing/unreadable
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _sanitize_filename(title: str) -> str:
|
|
50
|
+
"""Convert title to safe filename."""
|
|
51
|
+
sanitized = re.sub(r'[<>:"/\\|?*]', "_", title)
|
|
52
|
+
if len(sanitized) > 200:
|
|
53
|
+
sanitized = sanitized[:200]
|
|
54
|
+
sanitized = sanitized.strip()
|
|
55
|
+
if not sanitized:
|
|
56
|
+
sanitized = "untitled"
|
|
57
|
+
return sanitized
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _hash_file(path: Path) -> str:
|
|
61
|
+
digest = hashlib.sha256()
|
|
62
|
+
with path.open("rb") as handle:
|
|
63
|
+
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
|
64
|
+
digest.update(chunk)
|
|
65
|
+
return digest.hexdigest()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _build_pdf_hash_index(pdf_roots: Iterable[Path]) -> dict[str, Path]:
    """Map content SHA-256 -> PDF path; the first occurrence of a hash wins."""
    index: dict[str, Path] = {}

    def _register(candidate: Path) -> None:
        # setdefault keeps the earliest path seen for a duplicate hash.
        index.setdefault(_hash_file(candidate), candidate)

    for root in pdf_roots:
        if root.is_file() and root.suffix.lower() == ".pdf":
            _register(root)
        elif root.is_dir():
            for candidate in root.rglob("*.pdf"):
                if candidate.is_file():
                    _register(candidate)
        # Anything else (missing path, non-PDF file) is silently skipped.
    return index
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _unique_base_name(base: str, paper_id: str, used: set[str]) -> str:
|
|
86
|
+
candidate = base
|
|
87
|
+
if candidate in used:
|
|
88
|
+
candidate = f"{base}_{paper_id}"
|
|
89
|
+
counter = 1
|
|
90
|
+
while candidate in used:
|
|
91
|
+
candidate = f"{base}_{paper_id}_{counter}"
|
|
92
|
+
counter += 1
|
|
93
|
+
used.add(candidate)
|
|
94
|
+
return candidate
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _open_snapshot_db(path: Path) -> sqlite3.Connection:
|
|
98
|
+
conn = sqlite3.connect(path)
|
|
99
|
+
conn.row_factory = sqlite3.Row
|
|
100
|
+
return conn
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _print_summary(title: str, counts: UnpackCounts) -> None:
    """Render an UnpackCounts tally as a rich table on stdout."""
    table = Table(title=title, header_style="bold cyan", title_style="bold magenta")
    table.add_column("Metric", style="cyan", no_wrap=True)
    table.add_column("Value", style="white", overflow="fold")
    rows: list[tuple[str, int]] = [
        ("Total", counts.total),
        ("Succeeded", counts.succeeded),
        ("Failed", counts.failed),
        ("Missing PDF", counts.missing_pdf),
    ]
    # Translation rows only appear when any translation was attempted.
    if counts.translated_succeeded or counts.translated_failed:
        rows.append(("Translated succeeded", counts.translated_succeeded))
        rows.append(("Translated failed", counts.translated_failed))
    for metric, value in rows:
        table.add_row(metric, str(value))
    Console().print(table)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def unpack_md(opts: SnapshotUnpackMdOptions) -> None:
    """Unpack source/translated markdown and align filenames to PDFs.

    For each paper row in the snapshot DB, copies the source markdown from
    static_export_dir/md/<hash>.md into md_output_dir, and every translation
    from static_export_dir/md_translate/<lang>/<hash>.md into
    md_translated_output_dir, naming the outputs after the matching PDF when
    one is found. Prints a summary table when done.
    """
    opts.md_output_dir.mkdir(parents=True, exist_ok=True)
    opts.md_translated_output_dir.mkdir(parents=True, exist_ok=True)

    # Content-hash index of every PDF under the given roots; lets outputs be
    # named after the matching PDF instead of the (possibly unsafe) title.
    pdf_index = _build_pdf_hash_index(opts.pdf_roots)
    used_names: set[str] = set()
    counts = UnpackCounts()

    conn = _open_snapshot_db(opts.snapshot_db)
    try:
        cursor = conn.execute(
            """
            SELECT
                paper_id,
                title,
                source_hash,
                pdf_content_hash,
                source_md_content_hash
            FROM paper
            ORDER BY paper_index, title
            """
        )
        for row in cursor.fetchall():
            counts.total += 1
            paper_id = str(row["paper_id"])
            title = str(row["title"] or "")
            pdf_hash = row["pdf_content_hash"]
            md_hash = row["source_md_content_hash"]

            # Prefer the matching PDF's stem as the output base name; fall
            # back to a sanitized title when no PDF matches by content hash.
            base = ""
            if pdf_hash and pdf_hash in pdf_index:
                base = pdf_index[pdf_hash].stem
            else:
                counts.missing_pdf += 1
                base = _sanitize_filename(title)
            # De-duplicate against names already emitted in this run.
            base = _unique_base_name(base, paper_id, used_names)

            if md_hash:
                src_md = opts.static_export_dir / "md" / f"{md_hash}.md"
                if src_md.exists():
                    dst_md = opts.md_output_dir / f"{base}.md"
                    try:
                        dst_md.write_text(src_md.read_text(encoding="utf-8"), encoding="utf-8")
                        counts.succeeded += 1
                    except OSError:
                        counts.failed += 1
                else:
                    # Hash recorded in the DB but the asset is missing from
                    # the static export.
                    counts.failed += 1
            else:
                # No source markdown hash recorded for this paper.
                counts.failed += 1


            # Copy every available translation, suffixed with its language
            # code: <base>.<lang>.md.
            for tr_row in conn.execute(
                "SELECT lang, md_content_hash FROM paper_translation WHERE paper_id = ?",
                (paper_id,),
            ):
                lang = str(tr_row["lang"] or "").lower()
                tr_hash = tr_row["md_content_hash"]
                if not lang or not tr_hash:
                    counts.translated_failed += 1
                    continue
                src_tr = opts.static_export_dir / "md_translate" / lang / f"{tr_hash}.md"
                if not src_tr.exists():
                    counts.translated_failed += 1
                    continue
                dst_tr = opts.md_translated_output_dir / f"{base}.{lang}.md"
                try:
                    dst_tr.write_text(src_tr.read_text(encoding="utf-8"), encoding="utf-8")
                    counts.translated_succeeded += 1
                except OSError:
                    counts.translated_failed += 1
    finally:
        conn.close()

    _print_summary("snapshot unpack md summary", counts)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def unpack_info(opts: SnapshotUnpackInfoOptions) -> None:
    """Unpack aggregated paper_infos.json from snapshot summaries.

    Reads each paper's summary JSON from the static export, enriches it with
    identifying fields (paper_id, title, source_path, source_hash), and
    writes the collected list to opts.output_json. Prints a summary table.
    """
    pdf_index = _build_pdf_hash_index(opts.pdf_roots)
    counts = UnpackCounts()
    items: list[dict[str, Any]] = []

    conn = _open_snapshot_db(opts.snapshot_db)
    try:
        cursor = conn.execute(
            """
            SELECT
                paper_id,
                title,
                source_hash,
                pdf_content_hash
            FROM paper
            ORDER BY paper_index, title
            """
        )
        for row in cursor.fetchall():
            counts.total += 1
            paper_id = str(row["paper_id"])
            pdf_hash = row["pdf_content_hash"]
            if not (pdf_hash and pdf_hash in pdf_index):
                # Tracked for the summary only; a missing PDF does not stop
                # the unpack for this paper.
                counts.missing_pdf += 1

            # Preferred layout: summary/<paper_id>/<template>.json; the flat
            # summary/<paper_id>.json is a fallback — presumably a legacy
            # layout, TODO confirm against the builder.
            summary_path = opts.static_export_dir / "summary" / paper_id / f"{opts.template}.json"
            fallback_path = opts.static_export_dir / "summary" / f"{paper_id}.json"
            target_path = summary_path if summary_path.exists() else fallback_path
            used_fallback = target_path == fallback_path
            if not target_path.exists():
                counts.failed += 1
                continue
            try:
                payload = json.loads(target_path.read_text(encoding="utf-8"))
            except json.JSONDecodeError:
                counts.failed += 1
                continue
            if not isinstance(payload, dict):
                # Only JSON objects can be enriched with the fields below.
                counts.failed += 1
                continue

            # Base name mirrors unpack_md's naming: PDF stem when matched,
            # sanitized title otherwise (no per-run de-duplication here).
            base = ""
            if pdf_hash and pdf_hash in pdf_index:
                base = pdf_index[pdf_hash].stem
            else:
                base = _sanitize_filename(str(row["title"] or ""))
            source_path = f"{base}.md" if base else ""

            # Enrich the summary payload with identifying fields.
            payload["paper_id"] = paper_id
            payload["paper_title"] = str(row["title"] or "")
            payload["source_path"] = source_path
            payload["source_hash"] = str(row["source_hash"] or "")

            # NOTE: a fallback hit is counted as failed (it does not match
            # the requested template), but its payload is still emitted.
            if used_fallback:
                counts.failed += 1
            else:
                counts.succeeded += 1
            items.append(payload)
    finally:
        conn.close()

    opts.output_json.parent.mkdir(parents=True, exist_ok=True)
    opts.output_json.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")
    _print_summary("snapshot unpack info summary", counts)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deepresearch-flow
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Workflow tools for paper extraction, review, and research automation.
|
|
5
5
|
Author-email: DengQi <dengqi935@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -49,6 +49,7 @@ Requires-Dist: jinja2>=3.1.3
|
|
|
49
49
|
Requires-Dist: json-repair>=0.55.1
|
|
50
50
|
Requires-Dist: jsonschema>=4.26.0
|
|
51
51
|
Requires-Dist: markdown-it-py>=3.0.0
|
|
52
|
+
Requires-Dist: fastmcp>=3.0.0b1
|
|
52
53
|
Requires-Dist: mdit-py-plugins>=0.4.0
|
|
53
54
|
Requires-Dist: pypdf>=6.6.2
|
|
54
55
|
Requires-Dist: pylatexenc>=2.10
|
|
@@ -56,7 +57,7 @@ Requires-Dist: pybtex>=0.24.0
|
|
|
56
57
|
Requires-Dist: rich>=14.3.1
|
|
57
58
|
Requires-Dist: rumdl>=0.1.6
|
|
58
59
|
Requires-Dist: starlette>=0.52.1
|
|
59
|
-
Requires-Dist: tqdm>=4.
|
|
60
|
+
Requires-Dist: tqdm>=4.67.2
|
|
60
61
|
Requires-Dist: uvicorn>=0.27.1
|
|
61
62
|
Dynamic: license-file
|
|
62
63
|
|
|
@@ -4,7 +4,7 @@ deepresearch_flow/cli.py,sha256=t4oowCNWldL0DrVJ4d0UlRkuGU2qHej_G0mAc_quteQ,455
|
|
|
4
4
|
deepresearch_flow/paper/__init__.py,sha256=sunaOkcgAJBrfmcaJTumcWbPGVUSGWvOv2a2Yidzy0A,43
|
|
5
5
|
deepresearch_flow/paper/cli.py,sha256=68d-yccScU0yL6d7eqZVdudPO6i_in8F4v-hKDWILMo,13647
|
|
6
6
|
deepresearch_flow/paper/config.py,sha256=V7z4ApPXCV1acSl2FU3nZGq6nt8uisMhm0GtOq5zzmg,12021
|
|
7
|
-
deepresearch_flow/paper/db.py,sha256=
|
|
7
|
+
deepresearch_flow/paper/db.py,sha256=RvUN9jeoaEgLNvf8NhWYD-cgIIMZwdZRK3cq17pNWZI,94727
|
|
8
8
|
deepresearch_flow/paper/db_ops.py,sha256=cb64jn2ax39i3tCS-0DYmlsJdGX3uBS2u5ncUIbUBic,73980
|
|
9
9
|
deepresearch_flow/paper/extract.py,sha256=78ASAyNLfCl1AsAk2o_v1vskZCNZuayaCHgr0S4V2Vs,87632
|
|
10
10
|
deepresearch_flow/paper/llm.py,sha256=mHfs5IkT3Q6BOh46MDlfUmgVTX24WRf0IKKoOnN8nV8,4007
|
|
@@ -43,11 +43,14 @@ deepresearch_flow/paper/schemas/default_paper_schema.json,sha256=6h_2ayHolJj8JMn
|
|
|
43
43
|
deepresearch_flow/paper/schemas/eight_questions_schema.json,sha256=VFKKpdZkgPdQkYIW5jyrZQ7c2TlQZwB4svVWfoiwxdg,1005
|
|
44
44
|
deepresearch_flow/paper/schemas/three_pass_schema.json,sha256=8aNr4EdRiilxszIRBCC4hRNXrfIOcdnVW4Qhe6Fnh0o,689
|
|
45
45
|
deepresearch_flow/paper/snapshot/__init__.py,sha256=1VLO36xxDB3J5Yoo-HH9vyI-4ev2HcivXN0sNLg8O5k,102
|
|
46
|
-
deepresearch_flow/paper/snapshot/api.py,sha256=
|
|
46
|
+
deepresearch_flow/paper/snapshot/api.py,sha256=F_qehvCjxTBTGj9FmqP4NnJQayUPJm0N5e_8mm5JlDQ,37405
|
|
47
47
|
deepresearch_flow/paper/snapshot/builder.py,sha256=HbRcfNteMoP4RnQ4y2onZCm9XfnIvzXLn_EwsLZsDzY,38692
|
|
48
|
+
deepresearch_flow/paper/snapshot/common.py,sha256=KAhlGlPgabOCe9Faps8BoDqin71qpkCfaL_ADCr_9vg,917
|
|
48
49
|
deepresearch_flow/paper/snapshot/identity.py,sha256=k9x1EZPFBU1qgxzkTGvwVtDjLgcosmM_udPuvRLl0uI,7748
|
|
50
|
+
deepresearch_flow/paper/snapshot/mcp_server.py,sha256=lvgbXmuZCZ_zaQMdZEMjN-OChHPdoZ9MmuuQ-7ORias,22901
|
|
49
51
|
deepresearch_flow/paper/snapshot/schema.py,sha256=DcVmAklLYyEeDoVV9jYw7hoMHnHd9Eziivl-LP2busY,8991
|
|
50
52
|
deepresearch_flow/paper/snapshot/text.py,sha256=0RnxLowa6AdirdLsUYym6BhWbjwiP2Qj2oZeA-pjmdE,4368
|
|
53
|
+
deepresearch_flow/paper/snapshot/unpacker.py,sha256=ScKSFdrQLJHrITHe9KAxgAEH-vAAnXLolvW9zeJ3wsc,8575
|
|
51
54
|
deepresearch_flow/paper/snapshot/tests/__init__.py,sha256=G0IowrxHjGUIaqxcw6SvlcLFAtE5ZsleG6ECgd-sIdk,52
|
|
52
55
|
deepresearch_flow/paper/snapshot/tests/test_identity.py,sha256=KDFixAUU9l68KOum7gf1IrD0Oy18dBCSXG7RbJTqflA,4520
|
|
53
56
|
deepresearch_flow/paper/templates/__init__.py,sha256=p8W6kINvrf-T2X6Ow4GMr28syVOorFuMn0pbmieVzAw,35
|
|
@@ -463,9 +466,9 @@ deepresearch_flow/translator/placeholder.py,sha256=mEgqA-dPdOsIhno0h_hzfpXpY2asb
|
|
|
463
466
|
deepresearch_flow/translator/prompts.py,sha256=EvfBvBIpQXARDj4m87GAyFXJGL8EJeahj_rOmp9mv68,5556
|
|
464
467
|
deepresearch_flow/translator/protector.py,sha256=yUMuS2FgVofK_MRXrcauLRiwNvdCCjNAnh6CcNd686o,11777
|
|
465
468
|
deepresearch_flow/translator/segment.py,sha256=rBFMCLTrvm2GrPc_hNFymi-8Ih2DAtUQlZHCRE9nLaM,5146
|
|
466
|
-
deepresearch_flow-0.
|
|
467
|
-
deepresearch_flow-0.
|
|
468
|
-
deepresearch_flow-0.
|
|
469
|
-
deepresearch_flow-0.
|
|
470
|
-
deepresearch_flow-0.
|
|
471
|
-
deepresearch_flow-0.
|
|
469
|
+
deepresearch_flow-0.7.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
|
|
470
|
+
deepresearch_flow-0.7.0.dist-info/METADATA,sha256=aluWW1CXPeSWCLKopChdbgl_GHEQHByua1fBohr6Mzg,26728
|
|
471
|
+
deepresearch_flow-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
472
|
+
deepresearch_flow-0.7.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
|
|
473
|
+
deepresearch_flow-0.7.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
|
|
474
|
+
deepresearch_flow-0.7.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|