deepresearch-flow 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,8 @@ from deepresearch_flow.paper.template_registry import (
 from deepresearch_flow.paper.render import resolve_render_template, render_papers
 
 try:
-    from pybtex.database import parse_file
+    from pybtex.database import BibliographyData, parse_file
+    from pybtex.database.output.bibtex import Writer
     PYBTEX_AVAILABLE = True
 except ImportError:
     PYBTEX_AVAILABLE = False
@@ -423,6 +424,87 @@ def register_db_commands(db_group: click.Group) -> None:
         click.echo(f"Wrote snapshot DB: {opts.output_db}")
         click.echo(f"Wrote static export: {opts.static_export_dir}")
 
+    @snapshot_group.group("unpack")
+    def snapshot_unpack_group() -> None:
+        """Unpack snapshot artifacts."""
+
+    @snapshot_unpack_group.command("md")
+    @click.option("--snapshot-db", "snapshot_db", required=True, help="Path to snapshot database")
+    @click.option(
+        "--static-export-dir",
+        "static_export_dir",
+        required=True,
+        help="Path to static export directory",
+    )
+    @click.option(
+        "--pdf-root",
+        "pdf_roots",
+        multiple=True,
+        required=True,
+        help="PDF root directories for name alignment (repeatable)",
+    )
+    @click.option("--md-output-dir", "md_output_dir", required=True, help="Output directory for Markdown")
+    @click.option(
+        "--md-translated-output-dir",
+        "md_translated_output_dir",
+        required=True,
+        help="Output directory for translated Markdown",
+    )
+    def snapshot_unpack_md(
+        snapshot_db: str,
+        static_export_dir: str,
+        pdf_roots: tuple[str, ...],
+        md_output_dir: str,
+        md_translated_output_dir: str,
+    ) -> None:
+        """Unpack source/translated markdown and align filenames to PDFs."""
+        from deepresearch_flow.paper.snapshot.unpacker import SnapshotUnpackMdOptions, unpack_md
+
+        opts = SnapshotUnpackMdOptions(
+            snapshot_db=Path(snapshot_db),
+            static_export_dir=Path(static_export_dir),
+            pdf_roots=[Path(path) for path in pdf_roots],
+            md_output_dir=Path(md_output_dir),
+            md_translated_output_dir=Path(md_translated_output_dir),
+        )
+        unpack_md(opts)
+
+    @snapshot_unpack_group.command("info")
+    @click.option("--snapshot-db", "snapshot_db", required=True, help="Path to snapshot database")
+    @click.option(
+        "--static-export-dir",
+        "static_export_dir",
+        required=True,
+        help="Path to static export directory",
+    )
+    @click.option(
+        "--pdf-root",
+        "pdf_roots",
+        multiple=True,
+        required=True,
+        help="PDF root directories for name alignment (repeatable)",
+    )
+    @click.option("--template", "template", required=True, help="Summary template tag")
+    @click.option("--output-json", "output_json", required=True, help="Output JSON file path")
+    def snapshot_unpack_info(
+        snapshot_db: str,
+        static_export_dir: str,
+        pdf_roots: tuple[str, ...],
+        template: str,
+        output_json: str,
+    ) -> None:
+        """Unpack aggregated paper_infos.json from snapshot summaries."""
+        from deepresearch_flow.paper.snapshot.unpacker import SnapshotUnpackInfoOptions, unpack_info
+
+        opts = SnapshotUnpackInfoOptions(
+            snapshot_db=Path(snapshot_db),
+            static_export_dir=Path(static_export_dir),
+            pdf_roots=[Path(path) for path in pdf_roots],
+            template=template,
+            output_json=Path(output_json),
+        )
+        unpack_info(opts)
+
     @db_group.group("api")
     def api_group() -> None:
         """Read-only JSON API server backed by a snapshot DB."""
@@ -1015,11 +1097,18 @@ def register_db_commands(db_group: click.Group) -> None:
     def merge_group() -> None:
         """Merge paper JSON inputs."""
 
-    def _summarize_merge(output_path: Path, merged: list[dict[str, Any]], *, input_count: int) -> None:
+    def _summarize_merge(output_path: Path, merged: Any, *, input_count: int) -> None:
+        items: list[dict[str, Any]] = []
+        if isinstance(merged, dict):
+            raw_items = merged.get("papers")
+            if isinstance(raw_items, list):
+                items = [item for item in raw_items if isinstance(item, dict)]
+        elif isinstance(merged, list):
+            items = [item for item in merged if isinstance(item, dict)]
+
         field_set: set[str] = set()
-        for item in merged:
-            if isinstance(item, dict):
-                field_set.update(item.keys())
+        for item in items:
+            field_set.update(item.keys())
         field_list = sorted(field_set)
 
         console = Console()
@@ -1027,7 +1116,7 @@ def register_db_commands(db_group: click.Group) -> None:
         summary.add_column("Metric", style="bold")
         summary.add_column("Value")
         summary.add_row("Inputs", str(input_count))
-        summary.add_row("Items", str(len(merged)))
+        summary.add_row("Items", str(len(items)))
         summary.add_row("Fields", str(len(field_list)))
         summary.add_row("Output", str(output_path))
         console.print(summary)
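
`_summarize_merge` now accepts either the legacy bare list or the new `{template_tag, papers}` bundle and counts only dict items. A standalone sketch of that normalization (`normalize_items` is a hypothetical helper name; the payloads are illustrative):

from typing import Any

def normalize_items(merged: Any) -> list[dict[str, Any]]:
    # Same rules as _summarize_merge: bundles expose their "papers" list,
    # bare lists pass through, and non-dict items are dropped either way.
    if isinstance(merged, dict):
        raw = merged.get("papers")
        return [item for item in raw if isinstance(item, dict)] if isinstance(raw, list) else []
    if isinstance(merged, list):
        return [item for item in merged if isinstance(item, dict)]
    return []

legacy = [{"title": "A"}, {"title": "B"}]
bundle = {"template_tag": "summary_v1", "papers": legacy}
assert normalize_items(legacy) == normalize_items(bundle)  # both yield two items
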
@@ -1039,17 +1128,65 @@ def register_db_commands(db_group: click.Group) -> None:
             field_table.add_row(name)
         console.print(field_table)
 
+    def _bibtex_entry_score(entry: Any) -> int:
+        fields = getattr(entry, "fields", {}) or {}
+        persons = getattr(entry, "persons", {}) or {}
+        person_count = sum(len(people) for people in persons.values())
+        return len(fields) + len(persons) + person_count
+
+    def _summarize_bibtex_merge(output_path: Path, *, input_count: int, entry_count: int, duplicate_count: int) -> None:
+        summary = Table(title="BibTeX Merge Summary")
+        summary.add_column("Metric", style="bold")
+        summary.add_column("Value")
+        summary.add_row("Inputs", str(input_count))
+        summary.add_row("Entries", str(entry_count))
+        summary.add_row("Duplicates", str(duplicate_count))
+        summary.add_row("Output", str(output_path))
+        Console().print(summary)
+
     @merge_group.command("library")
     @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
+    @click.option("--template-tag", "template_tag", default=None, help="Template tag for merged output")
     @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
-    def merge_library(input_paths: Iterable[str], output_path: str) -> None:
+    def merge_library(input_paths: Iterable[str], template_tag: str | None, output_path: str) -> None:
         paths = [Path(path) for path in input_paths]
         merged: list[dict[str, Any]] = []
+        tag_candidates: list[str] = []
         for path in paths:
-            merged.extend(load_json(path))
+            payload = load_json(path)
+            if isinstance(payload, dict):
+                tag = str(payload.get("template_tag") or "")
+                if tag:
+                    tag_candidates.append(tag)
+                papers = payload.get("papers")
+                if isinstance(papers, list):
+                    merged.extend(papers)
+                else:
+                    raise click.ClickException("Input JSON must be a list or {template_tag, papers}")
+            elif isinstance(payload, list):
+                merged.extend(payload)
+            else:
+                raise click.ClickException("Input JSON must be a list or {template_tag, papers}")
+        if not template_tag:
+            inferred = ""
+            for paper in merged:
+                if not isinstance(paper, dict):
+                    continue
+                inferred = str(paper.get("prompt_template") or paper.get("template_tag") or "")
+                if inferred:
+                    break
+            if inferred:
+                template_tag = inferred
+        if tag_candidates and not template_tag:
+            template_tag = tag_candidates[0]
+        if not template_tag:
+            template_tag = "unknown"
+        if tag_candidates and any(tag != template_tag for tag in tag_candidates):
+            click.echo("Warning: multiple template_tag values detected in inputs; using first")
         output = Path(output_path)
-        write_json(output, merged)
-        _summarize_merge(output, merged, input_count=len(paths))
+        bundle = {"template_tag": template_tag, "papers": merged}
+        write_json(output, bundle)
+        _summarize_merge(output, bundle, input_count=len(paths))
 
     @merge_group.command("templates")
     @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
@@ -1201,6 +1338,62 @@ def register_db_commands(db_group: click.Group) -> None:
             sample_table.add_row(*row)
         Console().print(sample_table)
 
+    @merge_group.command("bibtex")
+    @click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input BibTeX file paths")
+    @click.option("-o", "--output", "output_path", required=True, help="Output BibTeX file path")
+    def merge_bibtex(input_paths: Iterable[str], output_path: str) -> None:
+        if not PYBTEX_AVAILABLE:
+            raise click.ClickException("pybtex is required for merge bibtex")
+
+        paths = [Path(path) for path in input_paths]
+        if not paths:
+            raise click.ClickException("No BibTeX inputs provided")
+
+        for path in paths:
+            if not path.is_file():
+                raise click.ClickException(f"BibTeX file not found: {path}")
+
+        merged_entries: dict[str, tuple[Any, int]] = {}
+        duplicate_keys: list[str] = []
+        duplicate_seen: set[str] = set()
+
+        for path in paths:
+            bib_data = parse_file(str(path))
+            for key, entry in bib_data.entries.items():
+                score = _bibtex_entry_score(entry)
+                if key not in merged_entries:
+                    merged_entries[key] = (entry, score)
+                    continue
+                if key not in duplicate_seen:
+                    duplicate_seen.add(key)
+                    duplicate_keys.append(key)
+                _, existing_score = merged_entries[key]
+                if score > existing_score:
+                    merged_entries[key] = (entry, score)
+
+        output = Path(output_path)
+        output.parent.mkdir(parents=True, exist_ok=True)
+        out_data = BibliographyData()
+        for key, (entry, _) in merged_entries.items():
+            out_data.entries[key] = entry
+        with output.open("w", encoding="utf-8") as handle:
+            Writer().write_stream(out_data, handle)
+
+        _summarize_bibtex_merge(
+            output,
+            input_count=len(paths),
+            entry_count=len(merged_entries),
+            duplicate_count=len(duplicate_keys),
+        )
+
+        if duplicate_keys:
+            preview_limit = 20
+            preview = ", ".join(duplicate_keys[:preview_limit])
+            if len(duplicate_keys) > preview_limit:
+                preview = f"{preview}, ... (+{len(duplicate_keys) - preview_limit} more)"
+            note = "Kept entry with most fields; ties keep first input order."
+            Console().print(Panel(f"{note}\n{preview}", title=f"Duplicate keys ({len(duplicate_keys)})", style="yellow"))
+
     @db_group.command("render-md")
     @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
     @click.option("-d", "--output-dir", "output_dir", default="rendered_md", help="Output directory")
@@ -11,8 +11,9 @@ from starlette.applications import Starlette
 from starlette.middleware.cors import CORSMiddleware
 from starlette.requests import Request
 from starlette.responses import JSONResponse, Response
-from starlette.routing import Route
+from starlette.routing import Mount, Route
 
+from deepresearch_flow.paper.snapshot.common import ApiLimits, _open_ro_conn
 from deepresearch_flow.paper.snapshot.text import merge_adjacent_markers, remove_cjk_spaces, rewrite_search_query
 
 _WHITESPACE_RE = re.compile(r"\s+")
@@ -87,13 +88,6 @@ _FACET_TYPE_TO_KEY = {
 }
 
 
-@dataclass(frozen=True)
-class ApiLimits:
-    max_query_length: int = 500
-    max_page_size: int = 100
-    max_pagination_offset: int = 10_000  # page * page_size
-
-
 @dataclass(frozen=True)
 class SnapshotApiConfig:
     snapshot_db: Path
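
`ApiLimits` moves to `deepresearch_flow.paper.snapshot.common` (shown in full at the end of this diff). Since it stays a frozen dataclass with defaults, callers can still override individual limits, e.g.:

from deepresearch_flow.paper.snapshot.common import ApiLimits

limits = ApiLimits(max_page_size=50)  # other fields keep their defaults
# limits.max_page_size = 25 would raise dataclasses.FrozenInstanceError
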
@@ -110,12 +104,6 @@ def _json_error(status_code: int, *, error: str, detail: str) -> JSONResponse:
     return JSONResponse({"error": error, "detail": detail}, status_code=status_code)
 
 
-def _open_ro_conn(db_path: Path) -> sqlite3.Connection:
-    uri = f"file:{db_path.as_posix()}?mode=ro"
-    conn = sqlite3.connect(uri, uri=True)
-    conn.row_factory = sqlite3.Row
-    conn.execute("PRAGMA query_only=ON;")
-    return conn
 
 
 def _snapshot_build_id(conn: sqlite3.Connection) -> str:
@@ -917,6 +905,22 @@ def create_app(
         limits=limits or ApiLimits(),
     )
 
+    # Lazy import to avoid circular dependency
+    from deepresearch_flow.paper.snapshot.mcp_server import (
+        McpSnapshotConfig,
+        create_mcp_app,
+        resolve_static_export_dir,
+    )
+
+    mcp_config = McpSnapshotConfig(
+        snapshot_db=snapshot_db,
+        static_base_url=_normalize_base_url(static_base_url),
+        static_export_dir=resolve_static_export_dir(),
+        limits=limits or ApiLimits(),
+        origin_allowlist=cors_allowed_origins or ["*"],
+    )
+    mcp_app, mcp_lifespan = create_mcp_app(mcp_config)
+
     routes = [
         Route("/api/v1/config", _api_config, methods=["GET"]),
         Route("/api/v1/search", _api_search, methods=["GET"]),
@@ -927,9 +931,15 @@ def create_app(
         Route("/api/v1/facets/{facet:str}/{facet_id:str}/stats", _api_facet_stats, methods=["GET"]),
         Route("/api/v1/facets/{facet:str}/by-value/{value:str}/papers", _api_facet_by_value_papers, methods=["GET"]),
         Route("/api/v1/facets/{facet:str}/by-value/{value:str}/stats", _api_facet_by_value_stats, methods=["GET"]),
+        Mount("/mcp", app=mcp_app),
     ]
 
-    app = Starlette(routes=routes)
+    # Pass MCP lifespan to ensure session manager initializes properly
+    # https://gofastmcp.com/deployment/http#mounting-in-starlette
+    app = Starlette(
+        routes=routes,
+        lifespan=mcp_lifespan,
+    )
     if cfg.cors_allowed_origins:
         app.add_middleware(
             CORSMiddleware,
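
Starlette does not run a mounted sub-application's startup/shutdown hooks on its own, which is why the MCP lifespan is forwarded to the parent `Starlette` here. A minimal sketch of the pattern with a stand-in sub-app and lifespan (names are illustrative, not the real MCP server):

import contextlib

from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Mount, Route

def ping(request):
    return JSONResponse({"ok": True})

sub_app = Starlette(routes=[Route("/ping", ping)])

@contextlib.asynccontextmanager
async def lifespan(app):
    # startup work for the mounted app (e.g. its session manager) goes here
    yield
    # shutdown cleanup goes here

app = Starlette(routes=[Mount("/sub", app=sub_app)], lifespan=lifespan)
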
@@ -0,0 +1,34 @@
+"""Shared utilities for snapshot API and MCP server.
+
+This module contains common types, configuration, and utilities used by both
+the snapshot REST API and the MCP server to avoid circular imports.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+import sqlite3
+
+
+@dataclass(frozen=True)
+class ApiLimits:
+    """API rate and size limits."""
+
+    max_query_length: int = 500
+    max_page_size: int = 100
+    max_pagination_offset: int = 10_000
+
+
+def _open_ro_conn(db_path: Path) -> sqlite3.Connection:
+    """Open a read-only SQLite connection with Row factory.
+
+    Args:
+        db_path: Path to the SQLite database file.
+
+    Returns:
+        sqlite3.Connection: A read-only connection with row_factory set to Row.
+    """
+    conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, check_same_thread=False)
+    conn.row_factory = sqlite3.Row
+    return conn
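
A short usage sketch for the shared helper; the DB path and table name are hypothetical. Opening with `mode=ro` in the SQLite URI makes any write attempt raise `sqlite3.OperationalError`, and `check_same_thread=False` lets a threaded server reuse the connection:

from pathlib import Path

from deepresearch_flow.paper.snapshot.common import _open_ro_conn

conn = _open_ro_conn(Path("snapshot.db"))
try:
    row = conn.execute("SELECT COUNT(*) AS n FROM papers").fetchone()  # hypothetical table
    print(row["n"])  # sqlite3.Row supports access by column name
finally:
    conn.close()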