deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1154 -35
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
  49. deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
  52. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
@@ -5,10 +5,13 @@ from __future__ import annotations
5
5
  import asyncio
6
6
  import json
7
7
  import re
8
+ import shutil
8
9
  from pathlib import Path
9
10
  from typing import Any, Iterable
10
11
  import difflib
11
12
 
13
+ from tqdm import tqdm
14
+
12
15
  import click
13
16
  import httpx
14
17
  from rich.console import Console
@@ -19,11 +22,17 @@ from deepresearch_flow.paper.config import load_config, resolve_api_keys
19
22
  from deepresearch_flow.paper.extract import parse_model_ref
20
23
  from deepresearch_flow.paper.llm import backoff_delay, call_provider
21
24
  from deepresearch_flow.paper.providers.base import ProviderError
22
- from deepresearch_flow.paper.template_registry import list_template_names
25
+ from deepresearch_flow.paper.schema import SchemaError, load_schema
26
+ from deepresearch_flow.paper.template_registry import (
27
+ get_stage_definitions,
28
+ list_template_names,
29
+ load_schema_for_template,
30
+ )
23
31
  from deepresearch_flow.paper.render import resolve_render_template, render_papers
24
32
 
25
33
  try:
26
- from pybtex.database import parse_file
34
+ from pybtex.database import BibliographyData, parse_file
35
+ from pybtex.database.output.bibtex import Writer
27
36
  PYBTEX_AVAILABLE = True
28
37
  except ImportError:
29
38
  PYBTEX_AVAILABLE = False
@@ -42,6 +51,74 @@ def write_json(path: Path, data: Any) -> None:
42
51
  path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
43
52
 
44
53
 
54
+ def load_json_payload(path: Path) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
55
+ try:
56
+ data = json.loads(path.read_text(encoding="utf-8"))
57
+ except json.JSONDecodeError as exc:
58
+ raise click.ClickException(f"Invalid JSON in {path}: {exc}") from exc
59
+
60
+ if isinstance(data, list):
61
+ return data, None
62
+ if isinstance(data, dict):
63
+ papers = data.get("papers")
64
+ if isinstance(papers, list):
65
+ return papers, data
66
+ raise click.ClickException(f"JSON object missing 'papers' list: {path}")
67
+
68
+ raise click.ClickException(f"Unsupported JSON structure in {path}")
69
+
70
+
71
+ def is_empty_value(value: Any) -> bool:
72
+ if value is None:
73
+ return True
74
+ if isinstance(value, str):
75
+ return value.strip() == ""
76
+ if isinstance(value, list) or isinstance(value, dict):
77
+ return len(value) == 0
78
+ return False
79
+
80
+
81
+ def export_compare_csv(results: list[Any], output_path: Path) -> None:
82
+ output_path.parent.mkdir(parents=True, exist_ok=True)
83
+
84
+ import csv
85
+
86
+ with open(output_path, "w", newline="", encoding="utf-8") as handle:
87
+ writer = csv.writer(handle)
88
+ writer.writerow([
89
+ "Side", "Source Hash", "Title", "Match Status", "Match Type",
90
+ "Match Score", "Source Path", "Other Source Hash", "Other Title",
91
+ "Other Source Path", "Lang"
92
+ ])
93
+ for result in results:
94
+ writer.writerow([
95
+ result.side,
96
+ result.source_hash,
97
+ result.title,
98
+ result.match_status,
99
+ result.match_type or "",
100
+ f"{result.match_score:.4f}",
101
+ result.source_path or "",
102
+ result.other_source_hash or "",
103
+ result.other_title or "",
104
+ result.other_source_path or "",
105
+ result.lang or "",
106
+ ])
107
+
108
+
109
+ def export_only_in_b_paths(results: list[Any], output_path: Path) -> int:
110
+ output_path.parent.mkdir(parents=True, exist_ok=True)
111
+ lines = []
112
+ for result in results:
113
+ if result.side != "B" or result.match_status != "only_in_B":
114
+ continue
115
+ if result.source_path:
116
+ lines.append(result.source_path)
117
+
118
+ output_path.write_text("\n".join(lines) + ("\n" if lines else ""), encoding="utf-8")
119
+ return len(lines)
120
+
121
+
45
122
  def normalize_authors(value: Any) -> list[str]:
46
123
  if value is None:
47
124
  return []
@@ -133,6 +210,18 @@ def parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
133
210
  return year, None
134
211
 
135
212
 
213
+ def resolve_relative_path(path: Path, roots: Iterable[Path]) -> Path:
214
+ resolved = path.resolve()
215
+ roots_by_depth = sorted(roots, key=lambda r: len(str(r.resolve())), reverse=True)
216
+ for root in roots_by_depth:
217
+ root_resolved = root.resolve()
218
+ try:
219
+ return resolved.relative_to(root_resolved)
220
+ except ValueError:
221
+ continue
222
+ return Path(path.name)
223
+
224
+
136
225
  def clean_journal_name(name: str | None) -> str:
137
226
  if not name:
138
227
  return "Unknown"
@@ -266,6 +355,147 @@ def parse_tag_list(text: str) -> list[str]:
266
355
 
267
356
 
268
357
  def register_db_commands(db_group: click.Group) -> None:
358
+ @db_group.group("snapshot")
359
+ def snapshot_group() -> None:
360
+ """Build production snapshot artifacts (SQLite + static export)."""
361
+
362
+ @snapshot_group.command("build")
363
+ @click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input JSON file path")
364
+ @click.option("-b", "--bibtex", "bibtex_path", default=None, help="Optional BibTeX file path")
365
+ @click.option(
366
+ "--md-root",
367
+ "md_roots",
368
+ multiple=True,
369
+ default=(),
370
+ help="Optional markdown root directory (repeatable) for source viewing",
371
+ )
372
+ @click.option(
373
+ "--md-translated-root",
374
+ "md_translated_roots",
375
+ multiple=True,
376
+ default=(),
377
+ help="Optional markdown root directory (repeatable) for translated viewing",
378
+ )
379
+ @click.option(
380
+ "--pdf-root",
381
+ "pdf_roots",
382
+ multiple=True,
383
+ default=(),
384
+ help="Optional PDF root directory (repeatable) for PDF discovery",
385
+ )
386
+ @click.option("--output-db", "output_db", default="paper_snapshot.db", show_default=True, help="Output DB path")
387
+ @click.option(
388
+ "--static-export-dir",
389
+ "static_export_dir",
390
+ default="paper-static",
391
+ show_default=True,
392
+ help="Output directory for hashed static assets",
393
+ )
394
+ @click.option(
395
+ "--previous-snapshot-db",
396
+ "previous_snapshot_db",
397
+ default=None,
398
+ help="Optional previous snapshot DB path for identity continuity",
399
+ )
400
+ def snapshot_build(
401
+ input_paths: tuple[str, ...],
402
+ bibtex_path: str | None,
403
+ md_roots: tuple[str, ...],
404
+ md_translated_roots: tuple[str, ...],
405
+ pdf_roots: tuple[str, ...],
406
+ output_db: str,
407
+ static_export_dir: str,
408
+ previous_snapshot_db: str | None,
409
+ ) -> None:
410
+ """Build a production snapshot (SQLite + static export)."""
411
+ from deepresearch_flow.paper.snapshot.builder import SnapshotBuildOptions, build_snapshot
412
+
413
+ opts = SnapshotBuildOptions(
414
+ input_paths=[Path(path) for path in input_paths],
415
+ bibtex_path=Path(bibtex_path) if bibtex_path else None,
416
+ md_roots=[Path(root) for root in md_roots],
417
+ md_translated_roots=[Path(root) for root in md_translated_roots],
418
+ pdf_roots=[Path(root) for root in pdf_roots],
419
+ output_db=Path(output_db),
420
+ static_export_dir=Path(static_export_dir),
421
+ previous_snapshot_db=Path(previous_snapshot_db) if previous_snapshot_db else None,
422
+ )
423
+ build_snapshot(opts)
424
+ click.echo(f"Wrote snapshot DB: {opts.output_db}")
425
+ click.echo(f"Wrote static export: {opts.static_export_dir}")
426
+
427
+ @db_group.group("api")
428
+ def api_group() -> None:
429
+ """Read-only JSON API server backed by a snapshot DB."""
430
+
431
+ @api_group.command("serve")
432
+ @click.option("--snapshot-db", "snapshot_db", required=True, help="Path to paper_snapshot.db")
433
+ @click.option(
434
+ "--static-base-url",
435
+ "static_base_url",
436
+ default=None,
437
+ help="Static asset base URL (e.g. https://static.example.com)",
438
+ )
439
+ @click.option(
440
+ "--cors-origin",
441
+ "cors_origins",
442
+ multiple=True,
443
+ default=(),
444
+ help="Allowed CORS origin (repeatable; default is '*')",
445
+ )
446
+ @click.option("--max-query-length", "max_query_length", type=int, default=500, show_default=True)
447
+ @click.option("--max-page-size", "max_page_size", type=int, default=100, show_default=True)
448
+ @click.option("--max-pagination-offset", "max_pagination_offset", type=int, default=10000, show_default=True)
449
+ @click.option("--host", default="127.0.0.1", show_default=True, help="Bind host")
450
+ @click.option("--port", default=8001, type=int, show_default=True, help="Bind port")
451
+ def api_serve(
452
+ snapshot_db: str,
453
+ static_base_url: str | None,
454
+ cors_origins: tuple[str, ...],
455
+ max_query_length: int,
456
+ max_page_size: int,
457
+ max_pagination_offset: int,
458
+ host: str,
459
+ port: int,
460
+ ) -> None:
461
+ """Serve the snapshot-backed JSON API."""
462
+ import os
463
+ import uvicorn
464
+
465
+ from deepresearch_flow.paper.snapshot.api import ApiLimits, create_app
466
+
467
+ static_base_url_value = (
468
+ static_base_url
469
+ or os.getenv("PAPER_DB_STATIC_BASE")
470
+ or os.getenv("PAPER_DB_STATIC_BASE_URL")
471
+ or ""
472
+ )
473
+ api_base_url = os.getenv("PAPER_DB_API_BASE") or ""
474
+ if api_base_url and host == "127.0.0.1" and port == 8001:
475
+ from urllib.parse import urlparse
476
+
477
+ parsed = urlparse(api_base_url)
478
+ if not parsed.scheme:
479
+ parsed = urlparse(f"http://{api_base_url}")
480
+ if parsed.hostname:
481
+ host = parsed.hostname
482
+ if parsed.port:
483
+ port = parsed.port
484
+ cors_allowed = list(cors_origins) if cors_origins else ["*"]
485
+ limits = ApiLimits(
486
+ max_query_length=max_query_length,
487
+ max_page_size=max_page_size,
488
+ max_pagination_offset=max_pagination_offset,
489
+ )
490
+ app = create_app(
491
+ snapshot_db=Path(snapshot_db),
492
+ static_base_url=static_base_url_value,
493
+ cors_allowed_origins=cors_allowed,
494
+ limits=limits,
495
+ )
496
+ click.echo(f"Serving API on http://{host}:{port} (Ctrl+C to stop)")
497
+ uvicorn.run(app, host=host, port=port, log_level="info")
498
+
269
499
  @db_group.command("append-bibtex")
270
500
  @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
271
501
  @click.option("-b", "--bibtex", "bibtex_path", required=True, help="Input BibTeX file path")
@@ -782,15 +1012,306 @@ def register_db_commands(db_group: click.Group) -> None:
782
1012
  write_json(Path(output_path), filtered)
783
1013
  click.echo(f"Filtered down to {len(filtered)} papers")
784
1014
 
785
- @db_group.command("merge")
1015
+ @db_group.group("merge")
1016
+ def merge_group() -> None:
1017
+ """Merge paper JSON inputs."""
1018
+
1019
+ def _summarize_merge(output_path: Path, merged: Any, *, input_count: int) -> None:
1020
+ items: list[dict[str, Any]] = []
1021
+ if isinstance(merged, dict):
1022
+ raw_items = merged.get("papers")
1023
+ if isinstance(raw_items, list):
1024
+ items = [item for item in raw_items if isinstance(item, dict)]
1025
+ elif isinstance(merged, list):
1026
+ items = [item for item in merged if isinstance(item, dict)]
1027
+
1028
+ field_set: set[str] = set()
1029
+ for item in items:
1030
+ field_set.update(item.keys())
1031
+ field_list = sorted(field_set)
1032
+
1033
+ console = Console()
1034
+ summary = Table(title="Merge Summary")
1035
+ summary.add_column("Metric", style="bold")
1036
+ summary.add_column("Value")
1037
+ summary.add_row("Inputs", str(input_count))
1038
+ summary.add_row("Items", str(len(items)))
1039
+ summary.add_row("Fields", str(len(field_list)))
1040
+ summary.add_row("Output", str(output_path))
1041
+ console.print(summary)
1042
+
1043
+ if field_list:
1044
+ field_table = Table(title="Fields")
1045
+ field_table.add_column("Name")
1046
+ for name in field_list:
1047
+ field_table.add_row(name)
1048
+ console.print(field_table)
1049
+
1050
+ def _bibtex_entry_score(entry: Any) -> int:
1051
+ fields = getattr(entry, "fields", {}) or {}
1052
+ persons = getattr(entry, "persons", {}) or {}
1053
+ person_count = sum(len(people) for people in persons.values())
1054
+ return len(fields) + len(persons) + person_count
1055
+
1056
+ def _summarize_bibtex_merge(output_path: Path, *, input_count: int, entry_count: int, duplicate_count: int) -> None:
1057
+ summary = Table(title="BibTeX Merge Summary")
1058
+ summary.add_column("Metric", style="bold")
1059
+ summary.add_column("Value")
1060
+ summary.add_row("Inputs", str(input_count))
1061
+ summary.add_row("Entries", str(entry_count))
1062
+ summary.add_row("Duplicates", str(duplicate_count))
1063
+ summary.add_row("Output", str(output_path))
1064
+ Console().print(summary)
1065
+
1066
+ @merge_group.command("library")
786
1067
  @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
1068
+ @click.option("--template-tag", "template_tag", default=None, help="Template tag for merged output")
787
1069
  @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
788
- def merge_papers(input_paths: Iterable[str], output_path: str) -> None:
1070
+ def merge_library(input_paths: Iterable[str], template_tag: str | None, output_path: str) -> None:
1071
+ paths = [Path(path) for path in input_paths]
789
1072
  merged: list[dict[str, Any]] = []
790
- for path in input_paths:
791
- merged.extend(load_json(Path(path)))
792
- write_json(Path(output_path), merged)
793
- click.echo(f"Merged {len(input_paths)} files into {output_path}")
1073
+ tag_candidates: list[str] = []
1074
+ for path in paths:
1075
+ payload = load_json(path)
1076
+ if isinstance(payload, dict):
1077
+ tag = str(payload.get("template_tag") or "")
1078
+ if tag:
1079
+ tag_candidates.append(tag)
1080
+ papers = payload.get("papers")
1081
+ if isinstance(papers, list):
1082
+ merged.extend(papers)
1083
+ else:
1084
+ raise click.ClickException("Input JSON must be a list or {template_tag, papers}")
1085
+ elif isinstance(payload, list):
1086
+ merged.extend(payload)
1087
+ else:
1088
+ raise click.ClickException("Input JSON must be a list or {template_tag, papers}")
1089
+ if not template_tag:
1090
+ inferred = ""
1091
+ for paper in merged:
1092
+ if not isinstance(paper, dict):
1093
+ continue
1094
+ inferred = str(paper.get("prompt_template") or paper.get("template_tag") or "")
1095
+ if inferred:
1096
+ break
1097
+ if inferred:
1098
+ template_tag = inferred
1099
+ if tag_candidates and not template_tag:
1100
+ template_tag = tag_candidates[0]
1101
+ if not template_tag:
1102
+ template_tag = "unknown"
1103
+ if tag_candidates and any(tag != template_tag for tag in tag_candidates):
1104
+ click.echo("Warning: multiple template_tag values detected in inputs; using first")
1105
+ output = Path(output_path)
1106
+ bundle = {"template_tag": template_tag, "papers": merged}
1107
+ write_json(output, bundle)
1108
+ _summarize_merge(output, bundle, input_count=len(paths))
1109
+
1110
+ @merge_group.command("templates")
1111
+ @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
1112
+ @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
1113
+ def merge_templates(input_paths: Iterable[str], output_path: str) -> None:
1114
+ from deepresearch_flow.paper import db_ops
1115
+
1116
+ paths = [Path(path) for path in input_paths]
1117
+ inputs = db_ops._load_paper_inputs(paths)
1118
+ if not inputs:
1119
+ raise click.ClickException("No input JSON files provided")
1120
+
1121
+ groups: list[dict[str, Any]] = []
1122
+ base_papers: list[dict[str, Any]] = []
1123
+ hash_to_group: dict[str, int] = {}
1124
+ paper_id_to_group: dict[int, int] = {}
1125
+ paper_index: dict[str, list[dict[str, Any]]] = {}
1126
+
1127
+ def rebuild_index() -> None:
1128
+ nonlocal paper_index, paper_id_to_group
1129
+ paper_index = db_ops._build_paper_index(base_papers)
1130
+ paper_id_to_group = {id(paper): idx for idx, paper in enumerate(base_papers)}
1131
+
1132
+ def add_group(template_tag: str, paper: dict[str, Any]) -> None:
1133
+ group = {
1134
+ "templates": {template_tag: paper},
1135
+ "template_order": [template_tag],
1136
+ }
1137
+ groups.append(group)
1138
+ base_papers.append(paper)
1139
+ source_hash = str(paper.get("source_hash") or "")
1140
+ if source_hash:
1141
+ hash_to_group[source_hash] = len(groups) - 1
1142
+ rebuild_index()
1143
+
1144
+ stats: dict[str, dict[str, int]] = {}
1145
+ diff_counts: dict[tuple[str, str], int] = {}
1146
+ diff_samples: list[tuple[str, str, str, str, str]] = []
1147
+ first_tag = str(inputs[0].get("template_tag") or "")
1148
+ base_items = inputs[0].get("papers") or []
1149
+ stats[first_tag] = {"total": len(base_items), "matched": len(base_items), "skipped": 0}
1150
+ for paper in base_items:
1151
+ if not isinstance(paper, dict):
1152
+ raise click.ClickException("Input papers must be objects")
1153
+ db_ops._prepare_paper_matching_fields(paper)
1154
+ add_group(first_tag, paper)
1155
+
1156
+ for bundle in inputs[1:]:
1157
+ template_tag = str(bundle.get("template_tag") or "")
1158
+ items = bundle.get("papers") or []
1159
+ matched = 0
1160
+ skipped = 0
1161
+ for paper in items:
1162
+ if not isinstance(paper, dict):
1163
+ raise click.ClickException("Input papers must be objects")
1164
+ db_ops._prepare_paper_matching_fields(paper)
1165
+ source_hash = str(paper.get("source_hash") or "")
1166
+ match_idx: int | None = None
1167
+ if source_hash and source_hash in hash_to_group:
1168
+ match_idx = hash_to_group[source_hash]
1169
+ else:
1170
+ match_paper, _, _ = db_ops._resolve_paper_by_title_and_meta(
1171
+ paper, paper_index
1172
+ )
1173
+ if match_paper is not None:
1174
+ match_idx = paper_id_to_group.get(id(match_paper))
1175
+ if match_idx is None:
1176
+ skipped += 1
1177
+ continue
1178
+ matched += 1
1179
+ group = groups[match_idx]
1180
+ base_templates = group.get("templates") or {}
1181
+ base_paper = base_templates.get(first_tag)
1182
+ if isinstance(base_paper, dict):
1183
+ for field in ("source_hash", "paper_title", "publication_date"):
1184
+ base_value = str(base_paper.get(field) or "")
1185
+ other_value = str(paper.get(field) or "")
1186
+ if base_value == other_value:
1187
+ continue
1188
+ diff_counts[(template_tag, field)] = diff_counts.get(
1189
+ (template_tag, field), 0
1190
+ ) + 1
1191
+ if len(diff_samples) < 50:
1192
+ diff_samples.append(
1193
+ (
1194
+ template_tag,
1195
+ field,
1196
+ str(base_paper.get("paper_title") or ""),
1197
+ base_value,
1198
+ other_value,
1199
+ )
1200
+ )
1201
+ templates = group.setdefault("templates", {})
1202
+ templates[template_tag] = paper
1203
+ order = group.setdefault("template_order", [])
1204
+ if template_tag not in order:
1205
+ order.append(template_tag)
1206
+ stats[template_tag] = {"total": len(items), "matched": matched, "skipped": skipped}
1207
+
1208
+ merged: list[dict[str, Any]] = []
1209
+ for group in groups:
1210
+ templates = group.get("templates") or {}
1211
+ order = group.get("template_order") or list(templates.keys())
1212
+ entry: dict[str, Any] = {}
1213
+ for tag in order:
1214
+ paper = templates.get(tag)
1215
+ if not isinstance(paper, dict):
1216
+ continue
1217
+ for key, value in paper.items():
1218
+ if key not in entry:
1219
+ entry[key] = value
1220
+ merged.append(entry)
1221
+
1222
+ output = Path(output_path)
1223
+ write_json(output, merged)
1224
+ _summarize_merge(output, merged, input_count=len(paths))
1225
+
1226
+ stat_table = Table(title="Template Merge Stats")
1227
+ stat_table.add_column("Template")
1228
+ stat_table.add_column("Total", justify="right")
1229
+ stat_table.add_column("Matched", justify="right")
1230
+ stat_table.add_column("Skipped", justify="right")
1231
+ for tag, values in stats.items():
1232
+ stat_table.add_row(
1233
+ tag or "(unknown)",
1234
+ str(values.get("total", 0)),
1235
+ str(values.get("matched", 0)),
1236
+ str(values.get("skipped", 0)),
1237
+ )
1238
+ Console().print(stat_table)
1239
+
1240
+ if diff_counts:
1241
+ diff_table = Table(title="Template Field Diff Summary")
1242
+ diff_table.add_column("Template")
1243
+ diff_table.add_column("Field")
1244
+ diff_table.add_column("Count", justify="right")
1245
+ for (template_tag, field), count in sorted(diff_counts.items()):
1246
+ diff_table.add_row(template_tag or "(unknown)", field, str(count))
1247
+ Console().print(diff_table)
1248
+
1249
+ if diff_samples:
1250
+ sample_table = Table(title="Template Field Diff Samples (up to 50)")
1251
+ sample_table.add_column("Template")
1252
+ sample_table.add_column("Field")
1253
+ sample_table.add_column("Base Title")
1254
+ sample_table.add_column("Base Value")
1255
+ sample_table.add_column("Other Value")
1256
+ for row in diff_samples:
1257
+ sample_table.add_row(*row)
1258
+ Console().print(sample_table)
1259
+
1260
+ @merge_group.command("bibtex")
1261
+ @click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input BibTeX file paths")
1262
+ @click.option("-o", "--output", "output_path", required=True, help="Output BibTeX file path")
1263
+ def merge_bibtex(input_paths: Iterable[str], output_path: str) -> None:
1264
+ if not PYBTEX_AVAILABLE:
1265
+ raise click.ClickException("pybtex is required for merge bibtex")
1266
+
1267
+ paths = [Path(path) for path in input_paths]
1268
+ if not paths:
1269
+ raise click.ClickException("No BibTeX inputs provided")
1270
+
1271
+ for path in paths:
1272
+ if not path.is_file():
1273
+ raise click.ClickException(f"BibTeX file not found: {path}")
1274
+
1275
+ merged_entries: dict[str, tuple[Any, int]] = {}
1276
+ duplicate_keys: list[str] = []
1277
+ duplicate_seen: set[str] = set()
1278
+
1279
+ for path in paths:
1280
+ bib_data = parse_file(str(path))
1281
+ for key, entry in bib_data.entries.items():
1282
+ score = _bibtex_entry_score(entry)
1283
+ if key not in merged_entries:
1284
+ merged_entries[key] = (entry, score)
1285
+ continue
1286
+ if key not in duplicate_seen:
1287
+ duplicate_seen.add(key)
1288
+ duplicate_keys.append(key)
1289
+ _, existing_score = merged_entries[key]
1290
+ if score > existing_score:
1291
+ merged_entries[key] = (entry, score)
1292
+
1293
+ output = Path(output_path)
1294
+ output.parent.mkdir(parents=True, exist_ok=True)
1295
+ out_data = BibliographyData()
1296
+ for key, (entry, _) in merged_entries.items():
1297
+ out_data.entries[key] = entry
1298
+ with output.open("w", encoding="utf-8") as handle:
1299
+ Writer().write_stream(out_data, handle)
1300
+
1301
+ _summarize_bibtex_merge(
1302
+ output,
1303
+ input_count=len(paths),
1304
+ entry_count=len(merged_entries),
1305
+ duplicate_count=len(duplicate_keys),
1306
+ )
1307
+
1308
+ if duplicate_keys:
1309
+ preview_limit = 20
1310
+ preview = ", ".join(duplicate_keys[:preview_limit])
1311
+ if len(duplicate_keys) > preview_limit:
1312
+ preview = f"{preview}, ... (+{len(duplicate_keys) - preview_limit} more)"
1313
+ note = "Kept entry with most fields; ties keep first input order."
1314
+ Console().print(Panel(f"{note}\n{preview}", title=f"Duplicate keys ({len(duplicate_keys)})", style="yellow"))
794
1315
 
795
1316
  @db_group.command("render-md")
796
1317
  @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
@@ -843,6 +1364,614 @@ def register_db_commands(db_group: click.Group) -> None:
843
1364
  rendered = render_papers(papers, out_dir, template, output_language)
844
1365
  click.echo(f"Rendered {rendered} markdown files")
845
1366
 
1367
@db_group.command("extract")
@click.option("--json", "target_json", default=None, help="Target JSON database path")
@click.option("--input-json", "input_json", default=None, help="Reference JSON file path")
@click.option(
    "--pdf-root", "pdf_roots", multiple=True, help="PDF root directories for reference (repeatable)"
)
@click.option(
    "--md-root", "md_roots", multiple=True, help="Markdown root directories for reference (repeatable)"
)
@click.option(
    "--md-translated-root", "md_translated_roots", multiple=True,
    help="Translated Markdown root directories to extract from (repeatable)"
)
@click.option(
    "--md-source-root", "md_source_roots", multiple=True,
    help="Source Markdown root directories to extract from (repeatable)"
)
@click.option("--output-json", "output_json", default=None, help="Output JSON file path")
@click.option(
    "--output-md-translated-root",
    "output_md_translated_root",
    default=None,
    help="Output directory for matched translated Markdown",
)
@click.option(
    "--output-md-root",
    "output_md_root",
    default=None,
    help="Output directory for matched source Markdown",
)
@click.option(
    "-b",
    "--input-bibtex",
    "input_bibtex",
    default=None,
    help="Reference BibTeX file path",
)
@click.option("--lang", "lang", default=None, help="Language code for translated Markdown (e.g., zh)")
@click.option("--output-csv", "output_csv", default=None, help="Path to export results as CSV")
def extract(
    target_json: str | None,
    input_json: str | None,
    pdf_roots: tuple[str, ...],
    md_roots: tuple[str, ...],
    md_translated_roots: tuple[str, ...],
    md_source_roots: tuple[str, ...],
    output_json: str | None,
    output_md_translated_root: str | None,
    output_md_root: str | None,
    input_bibtex: str | None,
    lang: str | None,
    output_csv: str | None,
) -> None:
    """Extract JSON entries and Markdown files that match a reference set."""
    # Overview of the three independent extraction passes:
    #   1. --json:               filter the target JSON down to entries matched
    #                            against the PDF/Markdown roots and/or the
    #                            reference JSON/BibTeX list, then write it out.
    #   2. --md-translated-root: copy translated Markdown files that match the
    #                            reference roots, preserving relative paths.
    #   3. --md-source-root:     same as (2) for source Markdown files.
    # Every pass appends its comparison rows to ``all_results`` so that a
    # single CSV can be exported at the end.
    from deepresearch_flow.paper import db_ops
    from deepresearch_flow.paper.utils import stable_hash

    # ---- option validation -------------------------------------------------
    if input_json and input_bibtex:
        raise click.ClickException("Use only one of --input-json or --input-bibtex")

    # Without an explicit --json, the reference JSON doubles as the target.
    if target_json is None and input_json is not None:
        target_json = input_json

    has_reference = bool(pdf_roots or md_roots or input_json or input_bibtex)
    if not has_reference:
        raise click.ClickException(
            "Provide at least one reference input: --pdf-root, --md-root, --input-json, or --input-bibtex"
        )
    if not target_json and not md_translated_roots and not md_source_roots:
        raise click.ClickException(
            "Provide --json and/or --md-translated-root and/or --md-source-root"
        )
    if target_json and not output_json:
        raise click.ClickException("--output-json is required when using --json")
    if output_json and not target_json:
        raise click.ClickException("--json is required when using --output-json")
    if md_translated_roots and not output_md_translated_root:
        raise click.ClickException(
            "--output-md-translated-root is required when using --md-translated-root"
        )
    if output_md_translated_root and not md_translated_roots:
        raise click.ClickException(
            "--md-translated-root is required when using --output-md-translated-root"
        )
    if md_source_roots and not output_md_root:
        raise click.ClickException("--output-md-root is required when using --md-source-root")
    if output_md_root and not md_source_roots:
        raise click.ClickException("--md-source-root is required when using --output-md-root")
    if md_translated_roots and not lang:
        raise click.ClickException("--lang is required when extracting translated Markdown")

    pdf_root_paths = [Path(path) for path in pdf_roots]
    md_root_paths = [Path(path) for path in md_roots]
    translated_root_paths = [Path(path) for path in md_translated_roots]
    source_root_paths = [Path(path) for path in md_source_roots]
    reference_json_path = Path(input_json) if input_json else None
    reference_bibtex_path = Path(input_bibtex) if input_bibtex else None

    # ---- local helpers -----------------------------------------------------
    def run_compare(**kwargs: Any) -> Any:
        """Run the pairwise comparison, surfacing ValueError as a CLI error."""
        # Always go through ``db_ops`` (imported above) so all three passes use
        # the same entry point; mirror the error handling of ``db compare``.
        try:
            return db_ops.compare_datasets_with_pairs(show_progress=True, **kwargs)
        except ValueError as exc:
            raise click.ClickException(str(exc)) from exc

    def copy_matched(
        dataset_papers: list[Any],
        matched: set[int],
        roots: list[Path],
        output_root: Path,
        desc: str,
    ) -> int:
        """Copy matched papers' source files under ``output_root``; return the count."""
        copied = 0
        progress = tqdm(
            enumerate(dataset_papers),
            total=len(dataset_papers),
            desc=desc,
            unit="file",
        )
        for idx, paper in progress:
            if idx not in matched:
                continue
            source_path = paper.get("source_path")
            if not source_path:
                continue
            source = Path(str(source_path))
            # Keep the file's position relative to its input root.
            destination = output_root / resolve_relative_path(source, roots)
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source, destination)
            copied += 1
        return copied

    # ---- load the reference paper list (JSON or BibTeX) --------------------
    reference_papers: list[dict[str, Any]] = []
    if reference_json_path:
        if not reference_json_path.is_file():
            raise click.ClickException(f"Reference JSON not found: {reference_json_path}")
        reference_papers, _ = load_json_payload(reference_json_path)
    if reference_bibtex_path:
        if not reference_bibtex_path.is_file():
            raise click.ClickException(f"Reference BibTeX not found: {reference_bibtex_path}")
        if not db_ops.PYBTEX_AVAILABLE:
            raise click.ClickException("pybtex is required for --input-bibtex support")
        bib_data = db_ops.parse_file(str(reference_bibtex_path))
        for key, entry in bib_data.entries.items():
            title = entry.fields.get("title")
            if not title:
                # Entries without a title cannot be matched by title.
                continue
            year = entry.fields.get("year") or ""
            year = str(year) if str(year).isdigit() else ""
            authors = [str(person) for person in entry.persons.get("author", [])]
            reference_papers.append(
                {
                    "paper_title": str(title),
                    "paper_authors": authors,
                    "publication_date": year,
                    "source_path": f"bibtex:{key}",
                }
            )

    reference_index: dict[str, list[dict[str, Any]]] = {}
    for paper in reference_papers:
        if "source_path" not in paper and reference_json_path:
            paper["source_path"] = str(reference_json_path)
        db_ops._prepare_paper_matching_fields(paper)
    if reference_papers:
        reference_index = db_ops._build_paper_index(reference_papers)

    all_results: list[Any] = []

    # ---- pass 1: target JSON ----------------------------------------------
    if target_json:
        target_json_path = Path(target_json)
        if not target_json_path.is_file():
            raise click.ClickException(f"Target JSON not found: {target_json_path}")
        papers, payload = load_json_payload(target_json_path)

        matched_indices: set[int]
        if pdf_root_paths or md_root_paths:
            results, match_pairs, _, _ = run_compare(
                json_paths_a=[target_json_path],
                pdf_roots_b=pdf_root_paths,
                md_roots_b=md_root_paths,
                bibtex_path=None,
                lang=None,
            )
            matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
            all_results.extend(results)
        else:
            # No file roots to compare against: every entry is a candidate.
            matched_indices = set(range(len(papers)))

        # Bound unconditionally: the CSV section below reads both names even
        # when the reference list produced an empty index.
        base_indices = set(matched_indices)
        # Keyed by id() of the reference dict; value is
        # (target index, detail score, match type, match score).
        best_matches: dict[int, tuple[int, tuple[int, int], str | None, float]] = {}

        if reference_index:
            def detail_score(paper: dict[str, Any]) -> tuple[int, int]:
                """Rank a paper by (number of non-empty values, total text length)."""
                non_empty = 0
                total_len = 0
                for value in paper.values():
                    if value is None:
                        continue
                    if isinstance(value, (list, dict)):
                        if value:
                            non_empty += 1
                            total_len += len(
                                json.dumps(value, ensure_ascii=False, sort_keys=True)
                            )
                    else:
                        text = str(value).strip()
                        if text:
                            non_empty += 1
                            total_len += len(text)
                return non_empty, total_len

            def resolve_reference_match(
                paper: dict[str, Any],
            ) -> tuple[dict[str, Any] | None, str | None, float]:
                """Find the reference entry for ``paper``, relaxing to year buckets."""
                match_paper, match_type, match_score = db_ops._resolve_paper_by_title_and_meta(
                    paper, reference_index
                )
                if match_paper is not None:
                    return match_paper, match_type, match_score
                year = str(paper.get("_year") or "").strip()
                if not year.isdigit():
                    return None, None, 0.0
                authors = paper.get("_authors") or []
                author_key = ""
                if authors:
                    author_key = db_ops._normalize_author_key(str(authors[0]))
                candidates: list[dict[str, Any]] = []
                fallback_type = "year_relaxed"
                if author_key:
                    candidates = reference_index.get(f"authoryear:{year}:{author_key}", [])
                    if candidates:
                        fallback_type = "author_year_relaxed"
                if not candidates:
                    candidates = reference_index.get(f"year:{year}", [])
                if not candidates:
                    return None, None, 0.0
                title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
                match, score = db_ops._adaptive_similarity_match_papers(title_key, candidates)
                if match is None:
                    # No similar title in the bucket: fall back to its first
                    # entry with a zero score.
                    return candidates[0], fallback_type, 0.0
                return match, fallback_type, score

            for idx, paper in enumerate(papers):
                if idx not in matched_indices:
                    continue
                db_ops._prepare_paper_matching_fields(paper)
                match_paper, match_type, match_score = resolve_reference_match(paper)
                if match_paper is None:
                    continue
                ref_id = id(match_paper)
                score = detail_score(paper)
                current = best_matches.get(ref_id)
                # Keep one target per reference: the most detailed one, with
                # the match score as tie-breaker.
                if (
                    current is None
                    or score > current[1]
                    or (score == current[1] and match_score > current[3])
                ):
                    best_matches[ref_id] = (idx, score, match_type, match_score)

            matched_indices = {idx for idx, *_ in best_matches.values()}

        matched_papers = [paper for idx, paper in enumerate(papers) if idx in matched_indices]
        deduped_papers: list[Any] = []
        seen_titles: set[str] = set()
        for paper in matched_papers:
            title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
            if title_key:
                if title_key in seen_titles:
                    continue
                seen_titles.add(title_key)
            # Entries with an empty normalized title are always kept.
            deduped_papers.append(paper)
        if len(deduped_papers) != len(matched_papers):
            removed = len(matched_papers) - len(deduped_papers)
            click.echo(f"Deduplicated {removed} entries by normalized title.")
            matched_papers = deduped_papers

        output_path = Path(output_json) if output_json else None
        if output_path is None:
            # Defensive: already enforced by the option validation above.
            raise click.ClickException("--output-json is required when using --json")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        if payload is None:
            write_json(output_path, matched_papers)
        else:
            # Preserve the wrapper payload, replacing only its paper list.
            output_payload = dict(payload)
            output_payload["papers"] = matched_papers
            write_json(output_path, output_payload)
        click.echo(f"Extracted {len(matched_papers)} JSON entries to {output_path}")

        if output_csv and reference_papers:
            match_meta_by_ref_id = {
                ref_id: (idx, match_type, match_score)
                for ref_id, (idx, _, match_type, match_score) in best_matches.items()
            }
            for ref in reference_papers:
                ref_id = id(ref)
                ref_title = str(ref.get("paper_title") or "")
                ref_hash = stable_hash(str(ref_title or ref.get("source_path") or ""))
                ref_path = str(ref.get("source_path") or "")
                if ref_id in match_meta_by_ref_id:
                    idx, match_type, match_score = match_meta_by_ref_id[ref_id]
                    paper = papers[idx]
                    paper_hash = str(paper.get("source_hash") or "") or stable_hash(
                        str(paper.get("paper_title") or "")
                    )
                    all_results.append(
                        db_ops.CompareResult(
                            side="MATCH",
                            source_hash=ref_hash,
                            title=ref_title,
                            match_status="matched_pair",
                            match_type=match_type,
                            match_score=match_score,
                            source_path=ref_path,
                            other_source_hash=paper_hash,
                            other_title=str(paper.get("paper_title") or ""),
                            other_source_path=str(paper.get("source_path") or ""),
                            lang=None,
                        )
                    )
                    continue
                all_results.append(
                    db_ops.CompareResult(
                        side="B",
                        source_hash=ref_hash,
                        title=ref_title,
                        match_status="only_in_B",
                        match_type=None,
                        match_score=0.0,
                        source_path=ref_path,
                        other_source_hash=None,
                        other_title=None,
                        other_source_path=None,
                        lang=None,
                    )
                )

            # Target entries that were candidates but lost to (or never found)
            # a reference entry.
            for idx in sorted(base_indices - matched_indices):
                paper = papers[idx]
                paper_title = str(paper.get("paper_title") or "")
                paper_hash = str(paper.get("source_hash") or "") or stable_hash(paper_title)
                all_results.append(
                    db_ops.CompareResult(
                        side="A",
                        source_hash=paper_hash,
                        title=paper_title,
                        match_status="only_in_A",
                        match_type=None,
                        match_score=0.0,
                        source_path=str(paper.get("source_path") or ""),
                        other_source_hash=None,
                        other_title=None,
                        other_source_path=None,
                        lang=None,
                    )
                )

    # ---- pass 2: translated Markdown --------------------------------------
    if md_translated_roots:
        output_root = Path(output_md_translated_root) if output_md_translated_root else None
        if output_root is None:
            raise click.ClickException(
                "--output-md-translated-root is required when using --md-translated-root"
            )
        results, match_pairs, dataset_a, _ = run_compare(
            md_translated_roots_a=translated_root_paths,
            pdf_roots_b=pdf_root_paths,
            md_roots_b=md_root_paths,
            lang=lang,
        )
        copied_count = copy_matched(
            dataset_a.papers,
            {idx_a for idx_a, _, _, _ in match_pairs},
            translated_root_paths,
            output_root,
            "copy translated",
        )
        click.echo(
            f"Copied {copied_count} translated Markdown files to {output_root}"
        )
        all_results.extend(results)

    # ---- pass 3: source Markdown ------------------------------------------
    if md_source_roots:
        output_root = Path(output_md_root) if output_md_root else None
        if output_root is None:
            raise click.ClickException("--output-md-root is required when using --md-source-root")
        results, match_pairs, dataset_a, _ = run_compare(
            md_roots_a=source_root_paths,
            pdf_roots_b=pdf_root_paths,
            md_roots_b=md_root_paths,
            lang=None,
        )
        copied_source = copy_matched(
            dataset_a.papers,
            {idx_a for idx_a, _, _, _ in match_pairs},
            source_root_paths,
            output_root,
            "copy source",
        )
        click.echo(f"Copied {copied_source} source Markdown files to {output_root}")
        all_results.extend(results)

    # ---- optional CSV export of every comparison row ----------------------
    if output_csv:
        output_path = Path(output_csv)
        export_compare_csv(all_results, output_path)
        click.echo(f"Results exported to: {output_path}")
1769
+
1770
@db_group.command("verify")
@click.option("--input-json", "input_json", required=True, help="Input JSON file path")
@click.option(
    "--output-json",
    "output_json",
    required=True,
    help="Output verification report JSON path",
)
@click.option(
    "--prompt-template",
    "prompt_template",
    default=None,
    type=click.Choice(list_template_names()),
    help="Prompt template to load schema (e.g., deep_read)",
)
@click.option(
    "-s",
    "--schema-json",
    "--schema",
    "schema_json",
    default=None,
    help="Custom schema JSON path",
)
@click.option(
    "--ignore-field",
    "ignore_fields",
    multiple=True,
    help="Schema field to ignore when checking empties (repeatable)",
)
def verify(
    input_json: str,
    output_json: str,
    prompt_template: str | None,
    schema_json: str | None,
    ignore_fields: tuple[str, ...],
) -> None:
    # Check every paper in the input JSON against a schema, write a report of
    # the fields that are absent or empty, and print summary tables.
    # (Kept as comments rather than a docstring so the command's --help text
    # is unchanged.)

    # Exactly one schema source must be chosen.
    if prompt_template and schema_json:
        raise click.ClickException("Use only one of --prompt-template or --schema-json")
    if not (prompt_template or schema_json):
        raise click.ClickException("Provide --prompt-template or --schema-json")

    source_file = Path(input_json)
    if not source_file.is_file():
        raise click.ClickException(f"Input JSON not found: {source_file}")

    papers, payload = load_json_payload(source_file)

    # Tag precedence: CLI template, then the tag stored in the payload, then "custom".
    embedded_tag = payload.get("template_tag") if isinstance(payload, dict) else None
    template_tag = prompt_template or embedded_tag or "custom"
    template_key = prompt_template or template_tag

    try:
        schema = load_schema(schema_json) if schema_json else load_schema_for_template(template_key)
    except (SchemaError, ValueError) as exc:
        raise click.ClickException(str(exc)) from exc

    # Fields to check: everything the schema declares or requires, minus ignores.
    ignore_set = {name.strip() for name in ignore_fields if name.strip()}
    declared = set(schema.get("properties", {}).keys()) | set(schema.get("required", []))
    schema_fields = sorted(declared - ignore_set)
    if not schema_fields:
        raise click.ClickException("Schema does not define any properties")

    # Map each field to the first stage that lists it.
    field_stage_map: dict[str, str] = {}
    for stage in get_stage_definitions(template_key):
        for name in stage.fields:
            if name not in ignore_set:
                field_stage_map.setdefault(name, stage.name)

    report_items: list[dict[str, Any]] = []
    for record in papers:
        if not isinstance(record, dict):
            continue
        gaps = [
            name
            for name in schema_fields
            if name not in record or is_empty_value(record.get(name))
        ]
        if not gaps:
            continue
        entry: dict[str, Any] = {
            "source_path": str(record.get("source_path") or ""),
            "paper_title": str(record.get("paper_title") or ""),
            "missing_fields": gaps,
        }
        # Retry stages are only reported when every missing field has a stage.
        if field_stage_map and all(name in field_stage_map for name in gaps):
            entry["retry_stages"] = sorted({field_stage_map[name] for name in gaps})
        report_items.append(entry)

    report_file = Path(output_json)
    report_file.parent.mkdir(parents=True, exist_ok=True)
    write_json(
        report_file,
        {
            "template_tag": template_tag,
            "schema_fields": schema_fields,
            "items": report_items,
        },
    )

    console = Console()
    total_missing = sum(len(entry["missing_fields"]) for entry in report_items)

    summary_table = Table(title="db verify summary")
    summary_table.add_column("Metric", style="cyan")
    summary_table.add_column("Value", style="white", overflow="fold")
    summary_rows = [
        ("Input", str(source_file)),
        ("Template", template_tag),
        ("Items", str(len(papers))),
        ("Items with missing fields", str(len(report_items))),
        ("Total missing fields", str(total_missing)),
    ]
    if ignore_set:
        summary_rows.append(("Ignored fields", ", ".join(sorted(ignore_set))))
    summary_rows.append(("Output", str(report_file)))
    for metric, value in summary_rows:
        summary_table.add_row(metric, value)
    console.print(summary_table)

    if not report_items:
        console.print(Panel("[green]No missing fields detected.[/green]", expand=False))
        return

    # Per-field tally, listed most-missing first and then alphabetically.
    tally: dict[str, int] = dict.fromkeys(schema_fields, 0)
    for entry in report_items:
        for name in entry["missing_fields"]:
            tally[name] = tally.get(name, 0) + 1

    count_table = Table(title="Missing field counts")
    count_table.add_column("Field", style="cyan")
    count_table.add_column("Missing", style="yellow", justify="right")
    for name, hits in sorted(tally.items(), key=lambda pair: (-pair[1], pair[0])):
        if hits:
            count_table.add_row(name, str(hits))
    console.print(count_table)

    detail_table = Table(title="Missing field details")
    detail_table.add_column("#", style="dim", justify="right")
    detail_table.add_column("Title", style="white", overflow="fold")
    detail_table.add_column("Source Path", style="cyan", overflow="fold")
    detail_table.add_column("Missing Fields", style="yellow", overflow="fold")
    detail_table.add_column("Retry Stages", style="green", overflow="fold")
    for position, entry in enumerate(report_items, start=1):
        detail_table.add_row(
            str(position),
            entry.get("paper_title") or "",
            entry.get("source_path") or "",
            ", ".join(entry.get("missing_fields", [])),
            ", ".join(entry.get("retry_stages") or []),
        )
    console.print(detail_table)
1929
+
1930
@db_group.command("transfer-pdfs")
@click.option("--input-list", "input_list", required=True, help="Text file containing PDF paths")
@click.option("--output-dir", "output_dir", required=True, help="Output directory")
@click.option("--move", "move_files", is_flag=True, help="Move PDFs instead of copying")
@click.option("--copy", "copy_files", is_flag=True, help="Copy PDFs instead of moving")
def transfer_pdfs(
    input_list: str,
    output_dir: str,
    move_files: bool,
    copy_files: bool,
) -> None:
    """Copy or move the PDFs listed in a text file into one output directory."""
    # Both flags unset or both set are equally invalid.
    if move_files == copy_files:
        raise click.ClickException("Specify exactly one of --move or --copy")

    list_path = Path(input_list)
    if not list_path.is_file():
        raise click.ClickException(f"Input list not found: {list_path}")

    destination_root = Path(output_dir)
    destination_root.mkdir(parents=True, exist_ok=True)

    # One path per line; surrounding whitespace and blank lines are ignored.
    stripped_lines = (line.strip() for line in list_path.read_text(encoding="utf-8").splitlines())
    entries = [line for line in stripped_lines if line]

    processed = 0
    missing = 0
    collisions = 0
    # All files are flattened into destination_root by base name, so two
    # listed PDFs with the same name would overwrite each other (and, with
    # --move, destroy the earlier one). Track names used in this run and skip
    # later duplicates instead of clobbering.
    claimed_names: set[str] = set()
    transfer_iter = tqdm(entries, total=len(entries), desc="transfer pdfs", unit="file")
    for raw in transfer_iter:
        source = Path(raw).expanduser()
        if not source.is_file():
            missing += 1
            continue
        if source.name in claimed_names:
            collisions += 1
            continue
        claimed_names.add(source.name)
        destination = destination_root / source.name
        if move_files:
            shutil.move(str(source), str(destination))
        else:
            shutil.copy2(source, destination)
        processed += 1

    action = "Moved" if move_files else "Copied"
    click.echo(f"{action} {processed} PDFs to {destination_root}")
    if missing:
        click.echo(f"Skipped {missing} missing paths")
    if collisions:
        click.echo(f"Skipped {collisions} paths with a file name already transferred in this run")
1974
+
846
1975
  @db_group.command("compare")
847
1976
  @click.option(
848
1977
  "-ia", "--input-a", "input_paths_a", multiple=True, help="Input JSON files for side A (repeatable)"
@@ -875,6 +2004,12 @@ def register_db_commands(db_group: click.Group) -> None:
875
2004
  @click.option(
876
2005
  "--output-csv", "output_csv", default=None, help="Path to export results as CSV"
877
2006
  )
2007
+ @click.option(
2008
+ "--output-only-in-b",
2009
+ "output_only_in_b",
2010
+ default=None,
2011
+ help="Path to export only-in-B source paths as a newline list",
2012
+ )
878
2013
  @click.option(
879
2014
  "--sample-limit", "sample_limit", default=5, type=int, show_default=True,
880
2015
  help="Number of sample items to show in terminal output"
@@ -891,12 +2026,12 @@ def register_db_commands(db_group: click.Group) -> None:
891
2026
  bibtex_path: str | None,
892
2027
  lang: str | None,
893
2028
  output_csv: str | None,
2029
+ output_only_in_b: str | None,
894
2030
  sample_limit: int,
895
2031
  ) -> None:
896
2032
  """Compare two datasets and report matches and differences."""
897
2033
  from deepresearch_flow.paper.db_ops import compare_datasets
898
- import csv
899
-
2034
+
900
2035
  # Validate that at least one input is provided for each side
901
2036
  has_input_a = bool(input_paths_a or pdf_roots_a or md_roots_a or md_translated_roots_a)
902
2037
  has_input_b = bool(input_paths_b or pdf_roots_b or md_roots_b or md_translated_roots_b)
@@ -925,6 +2060,7 @@ def register_db_commands(db_group: click.Group) -> None:
925
2060
  md_translated_roots_b=[Path(p) for p in md_translated_roots_b],
926
2061
  bibtex_path=Path(bibtex_path) if bibtex_path else None,
927
2062
  lang=lang,
2063
+ show_progress=True,
928
2064
  )
929
2065
  except ValueError as exc:
930
2066
  raise click.ClickException(str(exc)) from exc
@@ -998,31 +2134,14 @@ def register_db_commands(db_group: click.Group) -> None:
998
2134
  # Export to CSV if requested
999
2135
  if output_csv:
1000
2136
  output_path = Path(output_csv)
1001
- output_path.parent.mkdir(parents=True, exist_ok=True)
1002
-
1003
- with open(output_path, "w", newline="", encoding="utf-8") as f:
1004
- writer = csv.writer(f)
1005
- writer.writerow([
1006
- "Side", "Source Hash", "Title", "Match Status", "Match Type",
1007
- "Match Score", "Source Path", "Other Source Hash", "Other Title",
1008
- "Other Source Path", "Lang"
1009
- ])
1010
- for r in results:
1011
- writer.writerow([
1012
- r.side,
1013
- r.source_hash,
1014
- r.title,
1015
- r.match_status,
1016
- r.match_type or "",
1017
- f"{r.match_score:.4f}",
1018
- r.source_path or "",
1019
- r.other_source_hash or "",
1020
- r.other_title or "",
1021
- r.other_source_path or "",
1022
- r.lang or "",
1023
- ])
1024
-
2137
+ export_compare_csv(results, output_path)
1025
2138
  console.print(f"\n[green]Results exported to: {output_path}[/green]")
1026
-
2139
+ if output_only_in_b:
2140
+ output_path = Path(output_only_in_b)
2141
+ count = export_only_in_b_paths(results, output_path)
2142
+ console.print(
2143
+ f"\n[green]Only-in-B list exported ({count} items): {output_path}[/green]"
2144
+ )
2145
+
1027
2146
  # Print final counts
1028
2147
  console.print(f"\nTotal results: {len(results)}")