deepresearch-flow 0.6.1__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/__init__.py +1 -1
- deepresearch_flow/paper/db.py +81 -0
- deepresearch_flow/paper/snapshot/api.py +25 -15
- deepresearch_flow/paper/snapshot/common.py +34 -0
- deepresearch_flow/paper/snapshot/mcp_server.py +686 -0
- deepresearch_flow/paper/snapshot/unpacker.py +259 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.1.dist-info}/METADATA +153 -4
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.1.dist-info}/RECORD +12 -9
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.1.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.1.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.1.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.6.1.dist-info → deepresearch_flow-0.7.1.dist-info}/top_level.txt +0 -0
deepresearch_flow/__init__.py
CHANGED
deepresearch_flow/paper/db.py
CHANGED
|
@@ -424,6 +424,87 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
424
424
|
click.echo(f"Wrote snapshot DB: {opts.output_db}")
|
|
425
425
|
click.echo(f"Wrote static export: {opts.static_export_dir}")
|
|
426
426
|
|
|
427
|
+
# "unpack" sub-group of the snapshot command group; the "md" and "info"
# commands below attach themselves to it. The docstring doubles as the
# group's CLI help text, so it is kept verbatim.
@snapshot_group.group("unpack")
def snapshot_unpack_group() -> None:
    """Unpack snapshot artifacts."""
|
|
430
|
+
|
|
431
|
+
@snapshot_unpack_group.command("md")
@click.option("--snapshot-db", "snapshot_db", required=True, help="Path to snapshot database")
@click.option("--static-export-dir", "static_export_dir", required=True, help="Path to static export directory")
@click.option(
    "--pdf-root",
    "pdf_roots",
    multiple=True,
    required=True,
    help="PDF root directories for name alignment (repeatable)",
)
@click.option("--md-output-dir", "md_output_dir", required=True, help="Output directory for Markdown")
@click.option(
    "--md-translated-output-dir",
    "md_translated_output_dir",
    required=True,
    help="Output directory for translated Markdown",
)
def snapshot_unpack_md(
    snapshot_db: str,
    static_export_dir: str,
    pdf_roots: tuple[str, ...],
    md_output_dir: str,
    md_translated_output_dir: str,
) -> None:
    """Unpack source/translated markdown and align filenames to PDFs."""
    # NOTE: the docstring above is the command's CLI help text; keep it verbatim.
    # The unpacker is imported inside the command body, so the module is only
    # loaded when this command is actually invoked.
    from deepresearch_flow.paper.snapshot.unpacker import SnapshotUnpackMdOptions, unpack_md

    # Click delivers every option as a string (and the repeatable --pdf-root as
    # a tuple of strings); convert them to Path objects for the options object.
    unpack_md(
        SnapshotUnpackMdOptions(
            snapshot_db=Path(snapshot_db),
            static_export_dir=Path(static_export_dir),
            pdf_roots=list(map(Path, pdf_roots)),
            md_output_dir=Path(md_output_dir),
            md_translated_output_dir=Path(md_translated_output_dir),
        )
    )
|
|
471
|
+
|
|
472
|
+
@snapshot_unpack_group.command("info")
@click.option("--snapshot-db", "snapshot_db", required=True, help="Path to snapshot database")
@click.option("--static-export-dir", "static_export_dir", required=True, help="Path to static export directory")
@click.option(
    "--pdf-root",
    "pdf_roots",
    multiple=True,
    required=True,
    help="PDF root directories for name alignment (repeatable)",
)
@click.option("--template", "template", required=True, help="Summary template tag")
@click.option("--output-json", "output_json", required=True, help="Output JSON file path")
def snapshot_unpack_info(
    snapshot_db: str,
    static_export_dir: str,
    pdf_roots: tuple[str, ...],
    template: str,
    output_json: str,
) -> None:
    """Unpack aggregated paper_infos.json from snapshot summaries."""
    # NOTE: the docstring above is the command's CLI help text; keep it verbatim.
    # The unpacker is imported inside the command body, so the module is only
    # loaded when this command is actually invoked.
    from deepresearch_flow.paper.snapshot.unpacker import SnapshotUnpackInfoOptions, unpack_info

    # Path-like options arrive as strings and are converted to Path objects;
    # the template tag is passed through unchanged.
    unpack_info(
        SnapshotUnpackInfoOptions(
            snapshot_db=Path(snapshot_db),
            static_export_dir=Path(static_export_dir),
            pdf_roots=list(map(Path, pdf_roots)),
            template=template,
            output_json=Path(output_json),
        )
    )
|
|
507
|
+
|
|
427
508
|
@db_group.group("api")
|
|
428
509
|
def api_group() -> None:
|
|
429
510
|
"""Read-only JSON API server backed by a snapshot DB."""
|
|
@@ -11,8 +11,9 @@ from starlette.applications import Starlette
|
|
|
11
11
|
from starlette.middleware.cors import CORSMiddleware
|
|
12
12
|
from starlette.requests import Request
|
|
13
13
|
from starlette.responses import JSONResponse, Response
|
|
14
|
-
from starlette.routing import Route
|
|
14
|
+
from starlette.routing import Mount, Route
|
|
15
15
|
|
|
16
|
+
from deepresearch_flow.paper.snapshot.common import ApiLimits, _open_ro_conn
|
|
16
17
|
from deepresearch_flow.paper.snapshot.text import merge_adjacent_markers, remove_cjk_spaces, rewrite_search_query
|
|
17
18
|
|
|
18
19
|
_WHITESPACE_RE = re.compile(r"\s+")
|
|
@@ -87,13 +88,6 @@ _FACET_TYPE_TO_KEY = {
|
|
|
87
88
|
}
|
|
88
89
|
|
|
89
90
|
|
|
90
|
-
@dataclass(frozen=True)
|
|
91
|
-
class ApiLimits:
|
|
92
|
-
max_query_length: int = 500
|
|
93
|
-
max_page_size: int = 100
|
|
94
|
-
max_pagination_offset: int = 10_000 # page * page_size
|
|
95
|
-
|
|
96
|
-
|
|
97
91
|
@dataclass(frozen=True)
|
|
98
92
|
class SnapshotApiConfig:
|
|
99
93
|
snapshot_db: Path
|
|
@@ -110,12 +104,6 @@ def _json_error(status_code: int, *, error: str, detail: str) -> JSONResponse:
|
|
|
110
104
|
return JSONResponse({"error": error, "detail": detail}, status_code=status_code)
|
|
111
105
|
|
|
112
106
|
|
|
113
|
-
def _open_ro_conn(db_path: Path) -> sqlite3.Connection:
|
|
114
|
-
uri = f"file:{db_path.as_posix()}?mode=ro"
|
|
115
|
-
conn = sqlite3.connect(uri, uri=True)
|
|
116
|
-
conn.row_factory = sqlite3.Row
|
|
117
|
-
conn.execute("PRAGMA query_only=ON;")
|
|
118
|
-
return conn
|
|
119
107
|
|
|
120
108
|
|
|
121
109
|
def _snapshot_build_id(conn: sqlite3.Connection) -> str:
|
|
@@ -917,6 +905,22 @@ def create_app(
|
|
|
917
905
|
limits=limits or ApiLimits(),
|
|
918
906
|
)
|
|
919
907
|
|
|
908
|
+
# Lazy import to avoid circular dependency
|
|
909
|
+
from deepresearch_flow.paper.snapshot.mcp_server import (
|
|
910
|
+
McpSnapshotConfig,
|
|
911
|
+
create_mcp_app,
|
|
912
|
+
resolve_static_export_dir,
|
|
913
|
+
)
|
|
914
|
+
|
|
915
|
+
mcp_config = McpSnapshotConfig(
|
|
916
|
+
snapshot_db=snapshot_db,
|
|
917
|
+
static_base_url=_normalize_base_url(static_base_url),
|
|
918
|
+
static_export_dir=resolve_static_export_dir(),
|
|
919
|
+
limits=limits or ApiLimits(),
|
|
920
|
+
origin_allowlist=cors_allowed_origins or ["*"],
|
|
921
|
+
)
|
|
922
|
+
mcp_app, mcp_lifespan = create_mcp_app(mcp_config)
|
|
923
|
+
|
|
920
924
|
routes = [
|
|
921
925
|
Route("/api/v1/config", _api_config, methods=["GET"]),
|
|
922
926
|
Route("/api/v1/search", _api_search, methods=["GET"]),
|
|
@@ -927,9 +931,15 @@ def create_app(
|
|
|
927
931
|
Route("/api/v1/facets/{facet:str}/{facet_id:str}/stats", _api_facet_stats, methods=["GET"]),
|
|
928
932
|
Route("/api/v1/facets/{facet:str}/by-value/{value:str}/papers", _api_facet_by_value_papers, methods=["GET"]),
|
|
929
933
|
Route("/api/v1/facets/{facet:str}/by-value/{value:str}/stats", _api_facet_by_value_stats, methods=["GET"]),
|
|
934
|
+
Mount("/mcp", app=mcp_app),
|
|
930
935
|
]
|
|
931
936
|
|
|
932
|
-
|
|
937
|
+
# Pass MCP lifespan to ensure session manager initializes properly
|
|
938
|
+
# https://gofastmcp.com/deployment/http#mounting-in-starlette
|
|
939
|
+
app = Starlette(
|
|
940
|
+
routes=routes,
|
|
941
|
+
lifespan=mcp_lifespan,
|
|
942
|
+
)
|
|
933
943
|
if cfg.cors_allowed_origins:
|
|
934
944
|
app.add_middleware(
|
|
935
945
|
CORSMiddleware,
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Shared utilities for snapshot API and MCP server.
|
|
2
|
+
|
|
3
|
+
This module contains common types, configuration, and utilities used by both
|
|
4
|
+
the snapshot REST API and the MCP server to avoid circular imports.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import sqlite3
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class ApiLimits:
    """Immutable size limits shared by the snapshot REST API and MCP server.

    All three values are plain integer ceilings with defaults; instances are
    frozen, so a limit cannot be changed after construction.
    """

    # Longest accepted query, by the field's name (enforcement lives elsewhere).
    max_query_length: int = 500
    # Largest page size a request may use, by the field's name.
    max_page_size: int = 100
    # Ceiling on the pagination offset, i.e. page * page_size.
    max_pagination_offset: int = 10_000
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _open_ro_conn(db_path: Path) -> sqlite3.Connection:
    """Open a read-only SQLite connection with Row factory.

    The database is opened through a ``file:`` URI with ``mode=ro``. The path
    is converted to its POSIX form and percent-encoded first, so characters
    that are special inside a URI (``#``, ``?``, ``%``, spaces) stay part of
    the file name instead of truncating the URI and dropping ``mode=ro``.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        sqlite3.Connection: A read-only connection with row_factory set to Row.
        The connection is created with ``check_same_thread=False``.

    Raises:
        sqlite3.Error: Whatever ``sqlite3.connect`` raises when the database
            cannot be opened.
    """
    # Local import keeps this fix self-contained within the function.
    from urllib.parse import quote

    # as_posix() turns Windows backslashes into forward slashes; "/" and ":"
    # are left unescaped so directory separators and drive letters survive.
    uri_path = quote(Path(db_path).as_posix(), safe="/:")
    conn = sqlite3.connect(f"file:{uri_path}?mode=ro", uri=True, check_same_thread=False)
    conn.row_factory = sqlite3.Row
    return conn
|