deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1041 -34
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
  49. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
@@ -5,10 +5,13 @@ from __future__ import annotations
5
5
  import asyncio
6
6
  import json
7
7
  import re
8
+ import shutil
8
9
  from pathlib import Path
9
10
  from typing import Any, Iterable
10
11
  import difflib
11
12
 
13
+ from tqdm import tqdm
14
+
12
15
  import click
13
16
  import httpx
14
17
  from rich.console import Console
@@ -19,7 +22,12 @@ from deepresearch_flow.paper.config import load_config, resolve_api_keys
19
22
  from deepresearch_flow.paper.extract import parse_model_ref
20
23
  from deepresearch_flow.paper.llm import backoff_delay, call_provider
21
24
  from deepresearch_flow.paper.providers.base import ProviderError
22
- from deepresearch_flow.paper.template_registry import list_template_names
25
+ from deepresearch_flow.paper.schema import SchemaError, load_schema
26
+ from deepresearch_flow.paper.template_registry import (
27
+ get_stage_definitions,
28
+ list_template_names,
29
+ load_schema_for_template,
30
+ )
23
31
  from deepresearch_flow.paper.render import resolve_render_template, render_papers
24
32
 
25
33
  try:
@@ -42,6 +50,74 @@ def write_json(path: Path, data: Any) -> None:
42
50
  path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
43
51
 
44
52
 
53
+ def load_json_payload(path: Path) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
54
+ try:
55
+ data = json.loads(path.read_text(encoding="utf-8"))
56
+ except json.JSONDecodeError as exc:
57
+ raise click.ClickException(f"Invalid JSON in {path}: {exc}") from exc
58
+
59
+ if isinstance(data, list):
60
+ return data, None
61
+ if isinstance(data, dict):
62
+ papers = data.get("papers")
63
+ if isinstance(papers, list):
64
+ return papers, data
65
+ raise click.ClickException(f"JSON object missing 'papers' list: {path}")
66
+
67
+ raise click.ClickException(f"Unsupported JSON structure in {path}")
68
+
69
+
70
+ def is_empty_value(value: Any) -> bool:
71
+ if value is None:
72
+ return True
73
+ if isinstance(value, str):
74
+ return value.strip() == ""
75
+ if isinstance(value, list) or isinstance(value, dict):
76
+ return len(value) == 0
77
+ return False
78
+
79
+
80
+ def export_compare_csv(results: list[Any], output_path: Path) -> None:
81
+ output_path.parent.mkdir(parents=True, exist_ok=True)
82
+
83
+ import csv
84
+
85
+ with open(output_path, "w", newline="", encoding="utf-8") as handle:
86
+ writer = csv.writer(handle)
87
+ writer.writerow([
88
+ "Side", "Source Hash", "Title", "Match Status", "Match Type",
89
+ "Match Score", "Source Path", "Other Source Hash", "Other Title",
90
+ "Other Source Path", "Lang"
91
+ ])
92
+ for result in results:
93
+ writer.writerow([
94
+ result.side,
95
+ result.source_hash,
96
+ result.title,
97
+ result.match_status,
98
+ result.match_type or "",
99
+ f"{result.match_score:.4f}",
100
+ result.source_path or "",
101
+ result.other_source_hash or "",
102
+ result.other_title or "",
103
+ result.other_source_path or "",
104
+ result.lang or "",
105
+ ])
106
+
107
+
108
+ def export_only_in_b_paths(results: list[Any], output_path: Path) -> int:
109
+ output_path.parent.mkdir(parents=True, exist_ok=True)
110
+ lines = []
111
+ for result in results:
112
+ if result.side != "B" or result.match_status != "only_in_B":
113
+ continue
114
+ if result.source_path:
115
+ lines.append(result.source_path)
116
+
117
+ output_path.write_text("\n".join(lines) + ("\n" if lines else ""), encoding="utf-8")
118
+ return len(lines)
119
+
120
+
45
121
  def normalize_authors(value: Any) -> list[str]:
46
122
  if value is None:
47
123
  return []
@@ -133,6 +209,18 @@ def parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
133
209
  return year, None
134
210
 
135
211
 
212
+ def resolve_relative_path(path: Path, roots: Iterable[Path]) -> Path:
213
+ resolved = path.resolve()
214
+ roots_by_depth = sorted(roots, key=lambda r: len(str(r.resolve())), reverse=True)
215
+ for root in roots_by_depth:
216
+ root_resolved = root.resolve()
217
+ try:
218
+ return resolved.relative_to(root_resolved)
219
+ except ValueError:
220
+ continue
221
+ return Path(path.name)
222
+
223
+
136
224
  def clean_journal_name(name: str | None) -> str:
137
225
  if not name:
138
226
  return "Unknown"
@@ -266,6 +354,147 @@ def parse_tag_list(text: str) -> list[str]:
266
354
 
267
355
 
268
356
  def register_db_commands(db_group: click.Group) -> None:
357
+ @db_group.group("snapshot")
358
+ def snapshot_group() -> None:
359
+ """Build production snapshot artifacts (SQLite + static export)."""
360
+
361
+ @snapshot_group.command("build")
362
+ @click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input JSON file path")
363
+ @click.option("-b", "--bibtex", "bibtex_path", default=None, help="Optional BibTeX file path")
364
+ @click.option(
365
+ "--md-root",
366
+ "md_roots",
367
+ multiple=True,
368
+ default=(),
369
+ help="Optional markdown root directory (repeatable) for source viewing",
370
+ )
371
+ @click.option(
372
+ "--md-translated-root",
373
+ "md_translated_roots",
374
+ multiple=True,
375
+ default=(),
376
+ help="Optional markdown root directory (repeatable) for translated viewing",
377
+ )
378
+ @click.option(
379
+ "--pdf-root",
380
+ "pdf_roots",
381
+ multiple=True,
382
+ default=(),
383
+ help="Optional PDF root directory (repeatable) for PDF discovery",
384
+ )
385
+ @click.option("--output-db", "output_db", default="paper_snapshot.db", show_default=True, help="Output DB path")
386
+ @click.option(
387
+ "--static-export-dir",
388
+ "static_export_dir",
389
+ default="paper-static",
390
+ show_default=True,
391
+ help="Output directory for hashed static assets",
392
+ )
393
+ @click.option(
394
+ "--previous-snapshot-db",
395
+ "previous_snapshot_db",
396
+ default=None,
397
+ help="Optional previous snapshot DB path for identity continuity",
398
+ )
399
+ def snapshot_build(
400
+ input_paths: tuple[str, ...],
401
+ bibtex_path: str | None,
402
+ md_roots: tuple[str, ...],
403
+ md_translated_roots: tuple[str, ...],
404
+ pdf_roots: tuple[str, ...],
405
+ output_db: str,
406
+ static_export_dir: str,
407
+ previous_snapshot_db: str | None,
408
+ ) -> None:
409
+ """Build a production snapshot (SQLite + static export)."""
410
+ from deepresearch_flow.paper.snapshot.builder import SnapshotBuildOptions, build_snapshot
411
+
412
+ opts = SnapshotBuildOptions(
413
+ input_paths=[Path(path) for path in input_paths],
414
+ bibtex_path=Path(bibtex_path) if bibtex_path else None,
415
+ md_roots=[Path(root) for root in md_roots],
416
+ md_translated_roots=[Path(root) for root in md_translated_roots],
417
+ pdf_roots=[Path(root) for root in pdf_roots],
418
+ output_db=Path(output_db),
419
+ static_export_dir=Path(static_export_dir),
420
+ previous_snapshot_db=Path(previous_snapshot_db) if previous_snapshot_db else None,
421
+ )
422
+ build_snapshot(opts)
423
+ click.echo(f"Wrote snapshot DB: {opts.output_db}")
424
+ click.echo(f"Wrote static export: {opts.static_export_dir}")
425
+
426
+ @db_group.group("api")
427
+ def api_group() -> None:
428
+ """Read-only JSON API server backed by a snapshot DB."""
429
+
430
+ @api_group.command("serve")
431
+ @click.option("--snapshot-db", "snapshot_db", required=True, help="Path to paper_snapshot.db")
432
+ @click.option(
433
+ "--static-base-url",
434
+ "static_base_url",
435
+ default=None,
436
+ help="Static asset base URL (e.g. https://static.example.com)",
437
+ )
438
+ @click.option(
439
+ "--cors-origin",
440
+ "cors_origins",
441
+ multiple=True,
442
+ default=(),
443
+ help="Allowed CORS origin (repeatable; default is '*')",
444
+ )
445
+ @click.option("--max-query-length", "max_query_length", type=int, default=500, show_default=True)
446
+ @click.option("--max-page-size", "max_page_size", type=int, default=100, show_default=True)
447
+ @click.option("--max-pagination-offset", "max_pagination_offset", type=int, default=10000, show_default=True)
448
+ @click.option("--host", default="127.0.0.1", show_default=True, help="Bind host")
449
+ @click.option("--port", default=8001, type=int, show_default=True, help="Bind port")
450
+ def api_serve(
451
+ snapshot_db: str,
452
+ static_base_url: str | None,
453
+ cors_origins: tuple[str, ...],
454
+ max_query_length: int,
455
+ max_page_size: int,
456
+ max_pagination_offset: int,
457
+ host: str,
458
+ port: int,
459
+ ) -> None:
460
+ """Serve the snapshot-backed JSON API."""
461
+ import os
462
+ import uvicorn
463
+
464
+ from deepresearch_flow.paper.snapshot.api import ApiLimits, create_app
465
+
466
+ static_base_url_value = (
467
+ static_base_url
468
+ or os.getenv("PAPER_DB_STATIC_BASE")
469
+ or os.getenv("PAPER_DB_STATIC_BASE_URL")
470
+ or ""
471
+ )
472
+ api_base_url = os.getenv("PAPER_DB_API_BASE") or ""
473
+ if api_base_url and host == "127.0.0.1" and port == 8001:
474
+ from urllib.parse import urlparse
475
+
476
+ parsed = urlparse(api_base_url)
477
+ if not parsed.scheme:
478
+ parsed = urlparse(f"http://{api_base_url}")
479
+ if parsed.hostname:
480
+ host = parsed.hostname
481
+ if parsed.port:
482
+ port = parsed.port
483
+ cors_allowed = list(cors_origins) if cors_origins else ["*"]
484
+ limits = ApiLimits(
485
+ max_query_length=max_query_length,
486
+ max_page_size=max_page_size,
487
+ max_pagination_offset=max_pagination_offset,
488
+ )
489
+ app = create_app(
490
+ snapshot_db=Path(snapshot_db),
491
+ static_base_url=static_base_url_value,
492
+ cors_allowed_origins=cors_allowed,
493
+ limits=limits,
494
+ )
495
+ click.echo(f"Serving API on http://{host}:{port} (Ctrl+C to stop)")
496
+ uvicorn.run(app, host=host, port=port, log_level="info")
497
+
269
498
  @db_group.command("append-bibtex")
270
499
  @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
271
500
  @click.option("-b", "--bibtex", "bibtex_path", required=True, help="Input BibTeX file path")
@@ -782,15 +1011,195 @@ def register_db_commands(db_group: click.Group) -> None:
782
1011
  write_json(Path(output_path), filtered)
783
1012
  click.echo(f"Filtered down to {len(filtered)} papers")
784
1013
 
785
- @db_group.command("merge")
1014
+ @db_group.group("merge")
1015
+ def merge_group() -> None:
1016
+ """Merge paper JSON inputs."""
1017
+
1018
+ def _summarize_merge(output_path: Path, merged: list[dict[str, Any]], *, input_count: int) -> None:
1019
+ field_set: set[str] = set()
1020
+ for item in merged:
1021
+ if isinstance(item, dict):
1022
+ field_set.update(item.keys())
1023
+ field_list = sorted(field_set)
1024
+
1025
+ console = Console()
1026
+ summary = Table(title="Merge Summary")
1027
+ summary.add_column("Metric", style="bold")
1028
+ summary.add_column("Value")
1029
+ summary.add_row("Inputs", str(input_count))
1030
+ summary.add_row("Items", str(len(merged)))
1031
+ summary.add_row("Fields", str(len(field_list)))
1032
+ summary.add_row("Output", str(output_path))
1033
+ console.print(summary)
1034
+
1035
+ if field_list:
1036
+ field_table = Table(title="Fields")
1037
+ field_table.add_column("Name")
1038
+ for name in field_list:
1039
+ field_table.add_row(name)
1040
+ console.print(field_table)
1041
+
1042
+ @merge_group.command("library")
1043
+ @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
1044
+ @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
1045
+ def merge_library(input_paths: Iterable[str], output_path: str) -> None:
1046
+ paths = [Path(path) for path in input_paths]
1047
+ merged: list[dict[str, Any]] = []
1048
+ for path in paths:
1049
+ merged.extend(load_json(path))
1050
+ output = Path(output_path)
1051
+ write_json(output, merged)
1052
+ _summarize_merge(output, merged, input_count=len(paths))
1053
+
1054
+ @merge_group.command("templates")
786
1055
  @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
787
1056
  @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
788
- def merge_papers(input_paths: Iterable[str], output_path: str) -> None:
1057
+ def merge_templates(input_paths: Iterable[str], output_path: str) -> None:
1058
+ from deepresearch_flow.paper import db_ops
1059
+
1060
+ paths = [Path(path) for path in input_paths]
1061
+ inputs = db_ops._load_paper_inputs(paths)
1062
+ if not inputs:
1063
+ raise click.ClickException("No input JSON files provided")
1064
+
1065
+ groups: list[dict[str, Any]] = []
1066
+ base_papers: list[dict[str, Any]] = []
1067
+ hash_to_group: dict[str, int] = {}
1068
+ paper_id_to_group: dict[int, int] = {}
1069
+ paper_index: dict[str, list[dict[str, Any]]] = {}
1070
+
1071
+ def rebuild_index() -> None:
1072
+ nonlocal paper_index, paper_id_to_group
1073
+ paper_index = db_ops._build_paper_index(base_papers)
1074
+ paper_id_to_group = {id(paper): idx for idx, paper in enumerate(base_papers)}
1075
+
1076
+ def add_group(template_tag: str, paper: dict[str, Any]) -> None:
1077
+ group = {
1078
+ "templates": {template_tag: paper},
1079
+ "template_order": [template_tag],
1080
+ }
1081
+ groups.append(group)
1082
+ base_papers.append(paper)
1083
+ source_hash = str(paper.get("source_hash") or "")
1084
+ if source_hash:
1085
+ hash_to_group[source_hash] = len(groups) - 1
1086
+ rebuild_index()
1087
+
1088
+ stats: dict[str, dict[str, int]] = {}
1089
+ diff_counts: dict[tuple[str, str], int] = {}
1090
+ diff_samples: list[tuple[str, str, str, str, str]] = []
1091
+ first_tag = str(inputs[0].get("template_tag") or "")
1092
+ base_items = inputs[0].get("papers") or []
1093
+ stats[first_tag] = {"total": len(base_items), "matched": len(base_items), "skipped": 0}
1094
+ for paper in base_items:
1095
+ if not isinstance(paper, dict):
1096
+ raise click.ClickException("Input papers must be objects")
1097
+ db_ops._prepare_paper_matching_fields(paper)
1098
+ add_group(first_tag, paper)
1099
+
1100
+ for bundle in inputs[1:]:
1101
+ template_tag = str(bundle.get("template_tag") or "")
1102
+ items = bundle.get("papers") or []
1103
+ matched = 0
1104
+ skipped = 0
1105
+ for paper in items:
1106
+ if not isinstance(paper, dict):
1107
+ raise click.ClickException("Input papers must be objects")
1108
+ db_ops._prepare_paper_matching_fields(paper)
1109
+ source_hash = str(paper.get("source_hash") or "")
1110
+ match_idx: int | None = None
1111
+ if source_hash and source_hash in hash_to_group:
1112
+ match_idx = hash_to_group[source_hash]
1113
+ else:
1114
+ match_paper, _, _ = db_ops._resolve_paper_by_title_and_meta(
1115
+ paper, paper_index
1116
+ )
1117
+ if match_paper is not None:
1118
+ match_idx = paper_id_to_group.get(id(match_paper))
1119
+ if match_idx is None:
1120
+ skipped += 1
1121
+ continue
1122
+ matched += 1
1123
+ group = groups[match_idx]
1124
+ base_templates = group.get("templates") or {}
1125
+ base_paper = base_templates.get(first_tag)
1126
+ if isinstance(base_paper, dict):
1127
+ for field in ("source_hash", "paper_title", "publication_date"):
1128
+ base_value = str(base_paper.get(field) or "")
1129
+ other_value = str(paper.get(field) or "")
1130
+ if base_value == other_value:
1131
+ continue
1132
+ diff_counts[(template_tag, field)] = diff_counts.get(
1133
+ (template_tag, field), 0
1134
+ ) + 1
1135
+ if len(diff_samples) < 50:
1136
+ diff_samples.append(
1137
+ (
1138
+ template_tag,
1139
+ field,
1140
+ str(base_paper.get("paper_title") or ""),
1141
+ base_value,
1142
+ other_value,
1143
+ )
1144
+ )
1145
+ templates = group.setdefault("templates", {})
1146
+ templates[template_tag] = paper
1147
+ order = group.setdefault("template_order", [])
1148
+ if template_tag not in order:
1149
+ order.append(template_tag)
1150
+ stats[template_tag] = {"total": len(items), "matched": matched, "skipped": skipped}
1151
+
789
1152
  merged: list[dict[str, Any]] = []
790
- for path in input_paths:
791
- merged.extend(load_json(Path(path)))
792
- write_json(Path(output_path), merged)
793
- click.echo(f"Merged {len(input_paths)} files into {output_path}")
1153
+ for group in groups:
1154
+ templates = group.get("templates") or {}
1155
+ order = group.get("template_order") or list(templates.keys())
1156
+ entry: dict[str, Any] = {}
1157
+ for tag in order:
1158
+ paper = templates.get(tag)
1159
+ if not isinstance(paper, dict):
1160
+ continue
1161
+ for key, value in paper.items():
1162
+ if key not in entry:
1163
+ entry[key] = value
1164
+ merged.append(entry)
1165
+
1166
+ output = Path(output_path)
1167
+ write_json(output, merged)
1168
+ _summarize_merge(output, merged, input_count=len(paths))
1169
+
1170
+ stat_table = Table(title="Template Merge Stats")
1171
+ stat_table.add_column("Template")
1172
+ stat_table.add_column("Total", justify="right")
1173
+ stat_table.add_column("Matched", justify="right")
1174
+ stat_table.add_column("Skipped", justify="right")
1175
+ for tag, values in stats.items():
1176
+ stat_table.add_row(
1177
+ tag or "(unknown)",
1178
+ str(values.get("total", 0)),
1179
+ str(values.get("matched", 0)),
1180
+ str(values.get("skipped", 0)),
1181
+ )
1182
+ Console().print(stat_table)
1183
+
1184
+ if diff_counts:
1185
+ diff_table = Table(title="Template Field Diff Summary")
1186
+ diff_table.add_column("Template")
1187
+ diff_table.add_column("Field")
1188
+ diff_table.add_column("Count", justify="right")
1189
+ for (template_tag, field), count in sorted(diff_counts.items()):
1190
+ diff_table.add_row(template_tag or "(unknown)", field, str(count))
1191
+ Console().print(diff_table)
1192
+
1193
+ if diff_samples:
1194
+ sample_table = Table(title="Template Field Diff Samples (up to 50)")
1195
+ sample_table.add_column("Template")
1196
+ sample_table.add_column("Field")
1197
+ sample_table.add_column("Base Title")
1198
+ sample_table.add_column("Base Value")
1199
+ sample_table.add_column("Other Value")
1200
+ for row in diff_samples:
1201
+ sample_table.add_row(*row)
1202
+ Console().print(sample_table)
794
1203
 
795
1204
  @db_group.command("render-md")
796
1205
  @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
@@ -843,6 +1252,614 @@ def register_db_commands(db_group: click.Group) -> None:
843
1252
  rendered = render_papers(papers, out_dir, template, output_language)
844
1253
  click.echo(f"Rendered {rendered} markdown files")
845
1254
 
1255
+ @db_group.command("extract")
1256
+ @click.option("--json", "target_json", default=None, help="Target JSON database path")
1257
+ @click.option("--input-json", "input_json", default=None, help="Reference JSON file path")
1258
+ @click.option(
1259
+ "--pdf-root", "pdf_roots", multiple=True, help="PDF root directories for reference (repeatable)"
1260
+ )
1261
+ @click.option(
1262
+ "--md-root", "md_roots", multiple=True, help="Markdown root directories for reference (repeatable)"
1263
+ )
1264
+ @click.option(
1265
+ "--md-translated-root", "md_translated_roots", multiple=True,
1266
+ help="Translated Markdown root directories to extract from (repeatable)"
1267
+ )
1268
+ @click.option(
1269
+ "--md-source-root", "md_source_roots", multiple=True,
1270
+ help="Source Markdown root directories to extract from (repeatable)"
1271
+ )
1272
+ @click.option("--output-json", "output_json", default=None, help="Output JSON file path")
1273
+ @click.option(
1274
+ "--output-md-translated-root",
1275
+ "output_md_translated_root",
1276
+ default=None,
1277
+ help="Output directory for matched translated Markdown",
1278
+ )
1279
+ @click.option(
1280
+ "--output-md-root",
1281
+ "output_md_root",
1282
+ default=None,
1283
+ help="Output directory for matched source Markdown",
1284
+ )
1285
+ @click.option(
1286
+ "-b",
1287
+ "--input-bibtex",
1288
+ "input_bibtex",
1289
+ default=None,
1290
+ help="Reference BibTeX file path",
1291
+ )
1292
+ @click.option("--lang", "lang", default=None, help="Language code for translated Markdown (e.g., zh)")
1293
+ @click.option("--output-csv", "output_csv", default=None, help="Path to export results as CSV")
1294
+ def extract(
1295
+ target_json: str | None,
1296
+ input_json: str | None,
1297
+ pdf_roots: tuple[str, ...],
1298
+ md_roots: tuple[str, ...],
1299
+ md_translated_roots: tuple[str, ...],
1300
+ md_source_roots: tuple[str, ...],
1301
+ output_json: str | None,
1302
+ output_md_translated_root: str | None,
1303
+ output_md_root: str | None,
1304
+ input_bibtex: str | None,
1305
+ lang: str | None,
1306
+ output_csv: str | None,
1307
+ ) -> None:
1308
+ from deepresearch_flow.paper import db_ops
1309
+ from deepresearch_flow.paper.utils import stable_hash
1310
+
1311
+ if input_json and input_bibtex:
1312
+ raise click.ClickException("Use only one of --input-json or --input-bibtex")
1313
+
1314
+ if target_json is None and input_json is not None:
1315
+ target_json = input_json
1316
+
1317
+ has_reference = bool(pdf_roots or md_roots or input_json or input_bibtex)
1318
+ if not has_reference:
1319
+ raise click.ClickException(
1320
+ "Provide at least one reference input: --pdf-root, --md-root, --input-json, or --input-bibtex"
1321
+ )
1322
+ if not target_json and not md_translated_roots and not md_source_roots:
1323
+ raise click.ClickException(
1324
+ "Provide --json and/or --md-translated-root and/or --md-source-root"
1325
+ )
1326
+ if target_json and not output_json:
1327
+ raise click.ClickException("--output-json is required when using --json")
1328
+ if output_json and not target_json:
1329
+ raise click.ClickException("--json is required when using --output-json")
1330
+ if md_translated_roots and not output_md_translated_root:
1331
+ raise click.ClickException(
1332
+ "--output-md-translated-root is required when using --md-translated-root"
1333
+ )
1334
+ if output_md_translated_root and not md_translated_roots:
1335
+ raise click.ClickException(
1336
+ "--md-translated-root is required when using --output-md-translated-root"
1337
+ )
1338
+ if md_source_roots and not output_md_root:
1339
+ raise click.ClickException("--output-md-root is required when using --md-source-root")
1340
+ if output_md_root and not md_source_roots:
1341
+ raise click.ClickException("--md-source-root is required when using --output-md-root")
1342
+ if md_translated_roots and not lang:
1343
+ raise click.ClickException("--lang is required when extracting translated Markdown")
1344
+
1345
+ pdf_root_paths = [Path(path) for path in pdf_roots]
1346
+ md_root_paths = [Path(path) for path in md_roots]
1347
+ translated_root_paths = [Path(path) for path in md_translated_roots]
1348
+ source_root_paths = [Path(path) for path in md_source_roots]
1349
+ reference_json_path = Path(input_json) if input_json else None
1350
+ reference_bibtex_path = Path(input_bibtex) if input_bibtex else None
1351
+
1352
+ reference_papers: list[dict[str, Any]] = []
1353
+ if reference_json_path:
1354
+ if not reference_json_path.is_file():
1355
+ raise click.ClickException(f"Reference JSON not found: {reference_json_path}")
1356
+ reference_papers, _ = load_json_payload(reference_json_path)
1357
+ if reference_bibtex_path:
1358
+ if not reference_bibtex_path.is_file():
1359
+ raise click.ClickException(f"Reference BibTeX not found: {reference_bibtex_path}")
1360
+ if not db_ops.PYBTEX_AVAILABLE:
1361
+ raise click.ClickException("pybtex is required for --input-bibtex support")
1362
+ bib_data = db_ops.parse_file(str(reference_bibtex_path))
1363
+ for key, entry in bib_data.entries.items():
1364
+ title = entry.fields.get("title")
1365
+ if not title:
1366
+ continue
1367
+ year = entry.fields.get("year") or ""
1368
+ year = str(year) if str(year).isdigit() else ""
1369
+ authors = []
1370
+ for person in entry.persons.get("author", []):
1371
+ authors.append(str(person))
1372
+ reference_papers.append(
1373
+ {
1374
+ "paper_title": str(title),
1375
+ "paper_authors": authors,
1376
+ "publication_date": year,
1377
+ "source_path": f"bibtex:{key}",
1378
+ }
1379
+ )
1380
+
1381
+ reference_index: dict[str, list[dict[str, Any]]] = {}
1382
+ for paper in reference_papers:
1383
+ if "source_path" not in paper and reference_json_path:
1384
+ paper["source_path"] = str(reference_json_path)
1385
+ db_ops._prepare_paper_matching_fields(paper)
1386
+ if reference_papers:
1387
+ reference_index = db_ops._build_paper_index(reference_papers)
1388
+
1389
+ all_results: list[Any] = []
1390
+
1391
+ if target_json:
1392
+ target_json_path = Path(target_json)
1393
+ if not target_json_path.is_file():
1394
+ raise click.ClickException(f"Target JSON not found: {target_json_path}")
1395
+ papers, payload = load_json_payload(target_json_path)
1396
+
1397
+ results: list[Any] = []
1398
+ matched_indices: set[int]
1399
+ if pdf_root_paths or md_root_paths:
1400
+ results, match_pairs, _, _ = db_ops.compare_datasets_with_pairs(
1401
+ json_paths_a=[target_json_path],
1402
+ pdf_roots_b=pdf_root_paths,
1403
+ md_roots_b=md_root_paths,
1404
+ bibtex_path=None,
1405
+ lang=None,
1406
+ show_progress=True,
1407
+ )
1408
+ matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
1409
+ all_results.extend(results)
1410
+ else:
1411
+ matched_indices = set(range(len(papers)))
1412
+
1413
+ matched_reference_ids: set[int] = set()
1414
+ if reference_index:
1415
+ def detail_score(paper: dict[str, Any]) -> tuple[int, int]:
1416
+ non_empty = 0
1417
+ total_len = 0
1418
+ for value in paper.values():
1419
+ if value is None:
1420
+ continue
1421
+ if isinstance(value, (list, dict)):
1422
+ if value:
1423
+ non_empty += 1
1424
+ total_len += len(
1425
+ json.dumps(value, ensure_ascii=False, sort_keys=True)
1426
+ )
1427
+ else:
1428
+ text = str(value).strip()
1429
+ if text:
1430
+ non_empty += 1
1431
+ total_len += len(text)
1432
+ return non_empty, total_len
1433
+
1434
+ def resolve_reference_match(
1435
+ paper: dict[str, Any],
1436
+ ) -> tuple[dict[str, Any] | None, str | None, float]:
1437
+ match_paper, match_type, match_score = db_ops._resolve_paper_by_title_and_meta(
1438
+ paper, reference_index
1439
+ )
1440
+ if match_paper is not None:
1441
+ return match_paper, match_type, match_score
1442
+ year = str(paper.get("_year") or "").strip()
1443
+ if not year.isdigit():
1444
+ return None, None, 0.0
1445
+ authors = paper.get("_authors") or []
1446
+ author_key = ""
1447
+ if authors:
1448
+ author_key = db_ops._normalize_author_key(str(authors[0]))
1449
+ candidates: list[dict[str, Any]] = []
1450
+ fallback_type = "year_relaxed"
1451
+ if author_key:
1452
+ candidates = reference_index.get(f"authoryear:{year}:{author_key}", [])
1453
+ if candidates:
1454
+ fallback_type = "author_year_relaxed"
1455
+ if not candidates:
1456
+ candidates = reference_index.get(f"year:{year}", [])
1457
+ if not candidates:
1458
+ return None, None, 0.0
1459
+ title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
1460
+ match, score = db_ops._adaptive_similarity_match_papers(title_key, candidates)
1461
+ if match is None:
1462
+ return candidates[0], fallback_type, 0.0
1463
+ return match, fallback_type, score
1464
+
1465
+ base_indices = set(matched_indices)
1466
+ best_matches: dict[int, tuple[int, tuple[int, int], str | None, float]] = {}
1467
+ for idx, paper in enumerate(papers):
1468
+ if idx not in matched_indices:
1469
+ continue
1470
+ db_ops._prepare_paper_matching_fields(paper)
1471
+ match_paper, match_type, match_score = resolve_reference_match(paper)
1472
+ if match_paper is None:
1473
+ continue
1474
+ ref_id = id(match_paper)
1475
+ score = detail_score(paper)
1476
+ current = best_matches.get(ref_id)
1477
+ if current is None:
1478
+ best_matches[ref_id] = (idx, score, match_type, match_score)
1479
+ continue
1480
+ if score > current[1] or (score == current[1] and match_score > current[3]):
1481
+ best_matches[ref_id] = (idx, score, match_type, match_score)
1482
+
1483
+ matched_reference_ids = set(best_matches.keys())
1484
+ matched_indices = {idx for idx, *_ in best_matches.values()}
1485
+
1486
+ matched_papers = [paper for idx, paper in enumerate(papers) if idx in matched_indices]
1487
+ deduped_papers: list[Any] = []
1488
+ seen_titles: set[str] = set()
1489
+ for paper in matched_papers:
1490
+ title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
1491
+ if title_key:
1492
+ if title_key in seen_titles:
1493
+ continue
1494
+ seen_titles.add(title_key)
1495
+ deduped_papers.append(paper)
1496
+ if len(deduped_papers) != len(matched_papers):
1497
+ removed = len(matched_papers) - len(deduped_papers)
1498
+ click.echo(f"Deduplicated {removed} entries by normalized title.")
1499
+ matched_papers = deduped_papers
1500
+ output_path = Path(output_json) if output_json else None
1501
+ if output_path is None:
1502
+ raise click.ClickException("--output-json is required when using --json")
1503
+ output_path.parent.mkdir(parents=True, exist_ok=True)
1504
+ if payload is None:
1505
+ write_json(output_path, matched_papers)
1506
+ else:
1507
+ output_payload = dict(payload)
1508
+ output_payload["papers"] = matched_papers
1509
+ write_json(output_path, output_payload)
1510
+ click.echo(f"Extracted {len(matched_papers)} JSON entries to {output_path}")
1511
+
1512
+ if output_csv and reference_papers:
1513
+ match_meta_by_ref_id = {
1514
+ ref_id: (idx, match_type, match_score)
1515
+ for ref_id, (idx, _, match_type, match_score) in best_matches.items()
1516
+ }
1517
+ for ref in reference_papers:
1518
+ ref_id = id(ref)
1519
+ ref_title = str(ref.get("paper_title") or "")
1520
+ ref_hash = stable_hash(str(ref_title or ref.get("source_path") or ""))
1521
+ ref_path = str(ref.get("source_path") or "")
1522
+ if ref_id in match_meta_by_ref_id:
1523
+ idx, match_type, match_score = match_meta_by_ref_id[ref_id]
1524
+ paper = papers[idx]
1525
+ paper_hash = str(paper.get("source_hash") or "") or stable_hash(
1526
+ str(paper.get("paper_title") or "")
1527
+ )
1528
+ all_results.append(
1529
+ db_ops.CompareResult(
1530
+ side="MATCH",
1531
+ source_hash=ref_hash,
1532
+ title=ref_title,
1533
+ match_status="matched_pair",
1534
+ match_type=match_type,
1535
+ match_score=match_score,
1536
+ source_path=ref_path,
1537
+ other_source_hash=paper_hash,
1538
+ other_title=str(paper.get("paper_title") or ""),
1539
+ other_source_path=str(paper.get("source_path") or ""),
1540
+ lang=None,
1541
+ )
1542
+ )
1543
+ continue
1544
+ all_results.append(
1545
+ db_ops.CompareResult(
1546
+ side="B",
1547
+ source_hash=ref_hash,
1548
+ title=ref_title,
1549
+ match_status="only_in_B",
1550
+ match_type=None,
1551
+ match_score=0.0,
1552
+ source_path=ref_path,
1553
+ other_source_hash=None,
1554
+ other_title=None,
1555
+ other_source_path=None,
1556
+ lang=None,
1557
+ )
1558
+ )
1559
+
1560
+ for idx in sorted(base_indices - matched_indices):
1561
+ paper = papers[idx]
1562
+ paper_title = str(paper.get("paper_title") or "")
1563
+ paper_hash = str(paper.get("source_hash") or "") or stable_hash(paper_title)
1564
+ all_results.append(
1565
+ db_ops.CompareResult(
1566
+ side="A",
1567
+ source_hash=paper_hash,
1568
+ title=paper_title,
1569
+ match_status="only_in_A",
1570
+ match_type=None,
1571
+ match_score=0.0,
1572
+ source_path=str(paper.get("source_path") or ""),
1573
+ other_source_hash=None,
1574
+ other_title=None,
1575
+ other_source_path=None,
1576
+ lang=None,
1577
+ )
1578
+ )
1579
+
1580
+ copied_count = 0
1581
+ if md_translated_roots:
1582
+ output_root = Path(output_md_translated_root) if output_md_translated_root else None
1583
+ if output_root is None:
1584
+ raise click.ClickException(
1585
+ "--output-md-translated-root is required when using --md-translated-root"
1586
+ )
1587
+ results, match_pairs, dataset_a, _ = compare_datasets_with_pairs(
1588
+ md_translated_roots_a=translated_root_paths,
1589
+ pdf_roots_b=pdf_root_paths,
1590
+ md_roots_b=md_root_paths,
1591
+ lang=lang,
1592
+ show_progress=True,
1593
+ )
1594
+ matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
1595
+ copy_iter = tqdm(
1596
+ enumerate(dataset_a.papers),
1597
+ total=len(dataset_a.papers),
1598
+ desc="copy translated",
1599
+ unit="file",
1600
+ )
1601
+ for idx, paper in copy_iter:
1602
+ if idx not in matched_indices:
1603
+ continue
1604
+ source_path = paper.get("source_path")
1605
+ if not source_path:
1606
+ continue
1607
+ source = Path(str(source_path))
1608
+ relative = resolve_relative_path(source, translated_root_paths)
1609
+ destination = output_root / relative
1610
+ destination.parent.mkdir(parents=True, exist_ok=True)
1611
+ shutil.copy2(source, destination)
1612
+ copied_count += 1
1613
+ click.echo(
1614
+ f"Copied {copied_count} translated Markdown files to {output_root}"
1615
+ )
1616
+ all_results.extend(results)
1617
+
1618
+ if md_source_roots:
1619
+ output_root = Path(output_md_root) if output_md_root else None
1620
+ if output_root is None:
1621
+ raise click.ClickException("--output-md-root is required when using --md-source-root")
1622
+ results, match_pairs, dataset_a, _ = compare_datasets_with_pairs(
1623
+ md_roots_a=source_root_paths,
1624
+ pdf_roots_b=pdf_root_paths,
1625
+ md_roots_b=md_root_paths,
1626
+ lang=None,
1627
+ show_progress=True,
1628
+ )
1629
+ matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
1630
+ copied_source = 0
1631
+ copy_iter = tqdm(
1632
+ enumerate(dataset_a.papers),
1633
+ total=len(dataset_a.papers),
1634
+ desc="copy source",
1635
+ unit="file",
1636
+ )
1637
+ for idx, paper in copy_iter:
1638
+ if idx not in matched_indices:
1639
+ continue
1640
+ source_path = paper.get("source_path")
1641
+ if not source_path:
1642
+ continue
1643
+ source = Path(str(source_path))
1644
+ relative = resolve_relative_path(source, source_root_paths)
1645
+ destination = output_root / relative
1646
+ destination.parent.mkdir(parents=True, exist_ok=True)
1647
+ shutil.copy2(source, destination)
1648
+ copied_source += 1
1649
+ click.echo(f"Copied {copied_source} source Markdown files to {output_root}")
1650
+ copied_count += copied_source
1651
+ all_results.extend(results)
1652
+
1653
+ if output_csv:
1654
+ output_path = Path(output_csv)
1655
+ export_compare_csv(all_results, output_path)
1656
+ click.echo(f"Results exported to: {output_path}")
1657
+
1658
@db_group.command("verify")
@click.option("--input-json", "input_json", required=True, help="Input JSON file path")
@click.option(
    "--output-json",
    "output_json",
    required=True,
    help="Output verification report JSON path",
)
@click.option(
    "--prompt-template",
    "prompt_template",
    default=None,
    type=click.Choice(list_template_names()),
    help="Prompt template to load schema (e.g., deep_read)",
)
@click.option(
    "-s",
    "--schema-json",
    "--schema",
    "schema_json",
    default=None,
    help="Custom schema JSON path",
)
@click.option(
    "--ignore-field",
    "ignore_fields",
    multiple=True,
    help="Schema field to ignore when checking empties (repeatable)",
)
def verify(
    input_json: str,
    output_json: str,
    prompt_template: str | None,
    schema_json: str | None,
    ignore_fields: tuple[str, ...],
) -> None:
    """Report paper entries whose schema fields are missing or empty.

    Loads entries from ``--input-json``, resolves the expected field set from
    either a named prompt template or a custom schema file (exactly one must
    be given), writes a JSON report of incomplete entries to
    ``--output-json``, and prints summary/detail tables to the terminal.

    Raises:
        click.ClickException: on conflicting/missing schema options, a
            missing input file, schema loading failures, or an empty schema.
    """
    # The schema source is exclusive: template-derived or a custom file.
    if prompt_template and schema_json:
        raise click.ClickException("Use only one of --prompt-template or --schema-json")
    if not prompt_template and not schema_json:
        raise click.ClickException("Provide --prompt-template or --schema-json")

    input_path = Path(input_json)
    if not input_path.is_file():
        raise click.ClickException(f"Input JSON not found: {input_path}")

    papers, payload = load_json_payload(input_path)
    # Prefer the explicit CLI template, then the tag recorded in the payload,
    # and finally a generic "custom" label.
    template_tag = (
        prompt_template
        or (payload.get("template_tag") if isinstance(payload, dict) else None)
        or "custom"
    )

    try:
        if schema_json:
            schema = load_schema(schema_json)
        else:
            schema = load_schema_for_template(prompt_template or template_tag)
    except (SchemaError, ValueError) as exc:
        # Surface schema problems as clean CLI errors instead of tracebacks.
        raise click.ClickException(str(exc)) from exc

    ignore_set = {field.strip() for field in ignore_fields if field.strip()}
    properties = schema.get("properties", {})
    # Expected fields: union of declared properties and required fields,
    # minus anything the user asked to ignore.
    schema_fields = sorted(
        field
        for field in (set(properties.keys()) | set(schema.get("required", [])))
        if field not in ignore_set
    )
    if not schema_fields:
        raise click.ClickException("Schema does not define any properties")

    # Map each field to the first stage that produces it so the report can
    # suggest which extraction stages to retry.
    stage_defs = get_stage_definitions(prompt_template or template_tag)
    field_stage_map: dict[str, str] = {}
    for stage_def in stage_defs:
        for field in stage_def.fields:
            if field in ignore_set:
                continue
            field_stage_map.setdefault(field, stage_def.name)

    report_items: list[dict[str, Any]] = []
    for paper in papers:
        if not isinstance(paper, dict):
            continue
        missing_fields = [
            field
            for field in schema_fields
            if field not in paper or is_empty_value(paper.get(field))
        ]
        if not missing_fields:
            continue
        item: dict[str, Any] = {
            "source_path": str(paper.get("source_path") or ""),
            "paper_title": str(paper.get("paper_title") or ""),
            "missing_fields": missing_fields,
        }
        # Only suggest retry stages when every missing field maps to a known
        # stage; otherwise a partial stage retry could not repair the entry.
        if field_stage_map and all(field in field_stage_map for field in missing_fields):
            item["retry_stages"] = sorted(
                {field_stage_map[field] for field in missing_fields}
            )
        report_items.append(item)

    report_payload = {
        "template_tag": template_tag,
        "schema_fields": schema_fields,
        "items": report_items,
    }

    output_path = Path(output_json)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    write_json(output_path, report_payload)

    console = Console()
    total_missing = sum(len(item["missing_fields"]) for item in report_items)
    summary_table = Table(title="db verify summary")
    summary_table.add_column("Metric", style="cyan")
    summary_table.add_column("Value", style="white", overflow="fold")
    summary_table.add_row("Input", str(input_path))
    summary_table.add_row("Template", template_tag)
    summary_table.add_row("Items", str(len(papers)))
    summary_table.add_row("Items with missing fields", str(len(report_items)))
    summary_table.add_row("Total missing fields", str(total_missing))
    if ignore_set:
        summary_table.add_row("Ignored fields", ", ".join(sorted(ignore_set)))
    summary_table.add_row("Output", str(output_path))
    console.print(summary_table)

    if report_items:
        # Per-field miss counts, rendered most-frequent first.
        # Every missing field is drawn from schema_fields, so the dict is
        # fully pre-seeded and a plain increment is safe.
        field_counts: dict[str, int] = {field: 0 for field in schema_fields}
        for item in report_items:
            for field in item["missing_fields"]:
                field_counts[field] += 1

        count_table = Table(title="Missing field counts")
        count_table.add_column("Field", style="cyan")
        count_table.add_column("Missing", style="yellow", justify="right")
        for field, count in sorted(field_counts.items(), key=lambda x: (-x[1], x[0])):
            if count:
                count_table.add_row(field, str(count))
        console.print(count_table)

        detail_table = Table(title="Missing field details")
        detail_table.add_column("#", style="dim", justify="right")
        detail_table.add_column("Title", style="white", overflow="fold")
        detail_table.add_column("Source Path", style="cyan", overflow="fold")
        detail_table.add_column("Missing Fields", style="yellow", overflow="fold")
        detail_table.add_column("Retry Stages", style="green", overflow="fold")
        for idx, item in enumerate(report_items, start=1):
            retry_stages = item.get("retry_stages") or []
            detail_table.add_row(
                str(idx),
                item.get("paper_title") or "",
                item.get("source_path") or "",
                ", ".join(item.get("missing_fields", [])),
                ", ".join(retry_stages),
            )
        console.print(detail_table)
    else:
        console.print(Panel("[green]No missing fields detected.[/green]", expand=False))
1817
+
1818
@db_group.command("transfer-pdfs")
@click.option("--input-list", "input_list", required=True, help="Text file containing PDF paths")
@click.option("--output-dir", "output_dir", required=True, help="Output directory")
@click.option("--move", "move_files", is_flag=True, help="Move PDFs instead of copying")
@click.option("--copy", "copy_files", is_flag=True, help="Copy PDFs instead of moving")
def transfer_pdfs(
    input_list: str,
    output_dir: str,
    move_files: bool,
    copy_files: bool,
) -> None:
    """Move or copy the PDFs listed (one path per line) into an output directory.

    Paths that do not point to an existing file are counted and skipped.
    Exactly one of ``--move`` / ``--copy`` must be given.
    """
    # Exactly one transfer mode must be selected (flags default to False).
    if move_files == copy_files:
        raise click.ClickException("Specify exactly one of --move or --copy")

    manifest = Path(input_list)
    if not manifest.is_file():
        raise click.ClickException(f"Input list not found: {manifest}")

    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    # Keep only non-empty lines, stripped of surrounding whitespace.
    pdf_paths = [
        stripped
        for stripped in (
            line.strip()
            for line in manifest.read_text(encoding="utf-8").splitlines()
        )
        if stripped
    ]

    done = 0
    absent = 0
    for entry in tqdm(pdf_paths, total=len(pdf_paths), desc="transfer pdfs", unit="file"):
        candidate = Path(entry).expanduser()
        if not candidate.is_file():
            absent += 1
            continue
        target = target_root / candidate.name
        target.parent.mkdir(parents=True, exist_ok=True)
        if move_files:
            shutil.move(str(candidate), str(target))
        else:
            shutil.copy2(candidate, target)
        done += 1

    action = "Moved" if move_files else "Copied"
    click.echo(f"{action} {done} PDFs to {target_root}")
    if absent:
        click.echo(f"Skipped {absent} missing paths")
1862
+
846
1863
  @db_group.command("compare")
847
1864
  @click.option(
848
1865
  "-ia", "--input-a", "input_paths_a", multiple=True, help="Input JSON files for side A (repeatable)"
@@ -875,6 +1892,12 @@ def register_db_commands(db_group: click.Group) -> None:
875
1892
  @click.option(
876
1893
  "--output-csv", "output_csv", default=None, help="Path to export results as CSV"
877
1894
  )
1895
+ @click.option(
1896
+ "--output-only-in-b",
1897
+ "output_only_in_b",
1898
+ default=None,
1899
+ help="Path to export only-in-B source paths as a newline list",
1900
+ )
878
1901
  @click.option(
879
1902
  "--sample-limit", "sample_limit", default=5, type=int, show_default=True,
880
1903
  help="Number of sample items to show in terminal output"
@@ -891,12 +1914,12 @@ def register_db_commands(db_group: click.Group) -> None:
891
1914
  bibtex_path: str | None,
892
1915
  lang: str | None,
893
1916
  output_csv: str | None,
1917
+ output_only_in_b: str | None,
894
1918
  sample_limit: int,
895
1919
  ) -> None:
896
1920
  """Compare two datasets and report matches and differences."""
897
1921
  from deepresearch_flow.paper.db_ops import compare_datasets
898
- import csv
899
-
1922
+
900
1923
  # Validate that at least one input is provided for each side
901
1924
  has_input_a = bool(input_paths_a or pdf_roots_a or md_roots_a or md_translated_roots_a)
902
1925
  has_input_b = bool(input_paths_b or pdf_roots_b or md_roots_b or md_translated_roots_b)
@@ -925,6 +1948,7 @@ def register_db_commands(db_group: click.Group) -> None:
925
1948
  md_translated_roots_b=[Path(p) for p in md_translated_roots_b],
926
1949
  bibtex_path=Path(bibtex_path) if bibtex_path else None,
927
1950
  lang=lang,
1951
+ show_progress=True,
928
1952
  )
929
1953
  except ValueError as exc:
930
1954
  raise click.ClickException(str(exc)) from exc
@@ -998,31 +2022,14 @@ def register_db_commands(db_group: click.Group) -> None:
998
2022
  # Export to CSV if requested
999
2023
  if output_csv:
1000
2024
  output_path = Path(output_csv)
1001
- output_path.parent.mkdir(parents=True, exist_ok=True)
1002
-
1003
- with open(output_path, "w", newline="", encoding="utf-8") as f:
1004
- writer = csv.writer(f)
1005
- writer.writerow([
1006
- "Side", "Source Hash", "Title", "Match Status", "Match Type",
1007
- "Match Score", "Source Path", "Other Source Hash", "Other Title",
1008
- "Other Source Path", "Lang"
1009
- ])
1010
- for r in results:
1011
- writer.writerow([
1012
- r.side,
1013
- r.source_hash,
1014
- r.title,
1015
- r.match_status,
1016
- r.match_type or "",
1017
- f"{r.match_score:.4f}",
1018
- r.source_path or "",
1019
- r.other_source_hash or "",
1020
- r.other_title or "",
1021
- r.other_source_path or "",
1022
- r.lang or "",
1023
- ])
1024
-
2025
+ export_compare_csv(results, output_path)
1025
2026
  console.print(f"\n[green]Results exported to: {output_path}[/green]")
1026
-
2027
+ if output_only_in_b:
2028
+ output_path = Path(output_only_in_b)
2029
+ count = export_only_in_b_paths(results, output_path)
2030
+ console.print(
2031
+ f"\n[green]Only-in-B list exported ({count} items): {output_path}[/green]"
2032
+ )
2033
+
1027
2034
  # Print final counts
1028
2035
  console.print(f"\nTotal results: {len(results)}")