deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their public registries; it is provided for informational purposes only and reflects the changes between the two versions.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +124 -19
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +1 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +29 -7
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +51 -43
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/db.py
CHANGED
@@ -5,10 +5,13 @@ from __future__ import annotations
 import asyncio
 import json
 import re
+import shutil
 from pathlib import Path
 from typing import Any, Iterable
 import difflib
 
+from tqdm import tqdm
+
 import click
 import httpx
 from rich.console import Console
@@ -19,7 +22,12 @@ from deepresearch_flow.paper.config import load_config, resolve_api_keys
 from deepresearch_flow.paper.extract import parse_model_ref
 from deepresearch_flow.paper.llm import backoff_delay, call_provider
 from deepresearch_flow.paper.providers.base import ProviderError
-from deepresearch_flow.paper.
+from deepresearch_flow.paper.schema import SchemaError, load_schema
+from deepresearch_flow.paper.template_registry import (
+    get_stage_definitions,
+    list_template_names,
+    load_schema_for_template,
+)
 from deepresearch_flow.paper.render import resolve_render_template, render_papers
 
 try:
@@ -42,6 +50,74 @@ def write_json(path: Path, data: Any) -> None:
     path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
 
 
+def load_json_payload(path: Path) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise click.ClickException(f"Invalid JSON in {path}: {exc}") from exc
+
+    if isinstance(data, list):
+        return data, None
+    if isinstance(data, dict):
+        papers = data.get("papers")
+        if isinstance(papers, list):
+            return papers, data
+        raise click.ClickException(f"JSON object missing 'papers' list: {path}")
+
+    raise click.ClickException(f"Unsupported JSON structure in {path}")
+
+
+def is_empty_value(value: Any) -> bool:
+    if value is None:
+        return True
+    if isinstance(value, str):
+        return value.strip() == ""
+    if isinstance(value, list) or isinstance(value, dict):
+        return len(value) == 0
+    return False
+
+
+def export_compare_csv(results: list[Any], output_path: Path) -> None:
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    import csv
+
+    with open(output_path, "w", newline="", encoding="utf-8") as handle:
+        writer = csv.writer(handle)
+        writer.writerow([
+            "Side", "Source Hash", "Title", "Match Status", "Match Type",
+            "Match Score", "Source Path", "Other Source Hash", "Other Title",
+            "Other Source Path", "Lang"
+        ])
+        for result in results:
+            writer.writerow([
+                result.side,
+                result.source_hash,
+                result.title,
+                result.match_status,
+                result.match_type or "",
+                f"{result.match_score:.4f}",
+                result.source_path or "",
+                result.other_source_hash or "",
+                result.other_title or "",
+                result.other_source_path or "",
+                result.lang or "",
+            ])
+
+
+def export_only_in_b_paths(results: list[Any], output_path: Path) -> int:
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    lines = []
+    for result in results:
+        if result.side != "B" or result.match_status != "only_in_B":
+            continue
+        if result.source_path:
+            lines.append(result.source_path)
+
+    output_path.write_text("\n".join(lines) + ("\n" if lines else ""), encoding="utf-8")
+    return len(lines)
+
+
 def normalize_authors(value: Any) -> list[str]:
     if value is None:
         return []
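
Note: load_json_payload is the common entry point for both payload shapes the 0.6.0 commands accept: a bare JSON list of papers, or an object wrapping a "papers" list (which may carry extras such as "template_tag"). A minimal sketch of both shapes, assuming the wheel is installed so deepresearch_flow.paper.db is importable; the temp files are illustrative only:

    import json
    import tempfile
    from pathlib import Path

    from deepresearch_flow.paper.db import load_json_payload

    with tempfile.TemporaryDirectory() as tmp:
        bare = Path(tmp) / "bare.json"
        bare.write_text(json.dumps([{"paper_title": "A"}]), encoding="utf-8")
        papers, payload = load_json_payload(bare)
        assert payload is None and len(papers) == 1  # bare list: no wrapper object

        wrapped = Path(tmp) / "wrapped.json"
        wrapped.write_text(
            json.dumps({"template_tag": "deep_read", "papers": [{"paper_title": "B"}]}),
            encoding="utf-8",
        )
        papers, payload = load_json_payload(wrapped)
        assert payload is not None and payload["template_tag"] == "deep_read"
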
@@ -133,6 +209,18 @@ def parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
     return year, None
 
 
+def resolve_relative_path(path: Path, roots: Iterable[Path]) -> Path:
+    resolved = path.resolve()
+    roots_by_depth = sorted(roots, key=lambda r: len(str(r.resolve())), reverse=True)
+    for root in roots_by_depth:
+        root_resolved = root.resolve()
+        try:
+            return resolved.relative_to(root_resolved)
+        except ValueError:
+            continue
+    return Path(path.name)
+
+
 def clean_journal_name(name: str | None) -> str:
     if not name:
         return "Unknown"
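
Note: resolve_relative_path tries the roots longest-resolved-path first (string length as a proxy for depth), so a file under nested roots is made relative to the deepest one, and a file under no root degrades to its bare filename. A small sketch with hypothetical paths, assuming the wheel is installed:

    from pathlib import Path

    from deepresearch_flow.paper.db import resolve_relative_path

    roots = [Path("/data/md"), Path("/data/md/translated")]  # hypothetical roots
    print(resolve_relative_path(Path("/data/md/translated/zh/p.md"), roots))  # zh/p.md
    print(resolve_relative_path(Path("/elsewhere/p.md"), roots))              # p.md
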
@@ -266,6 +354,147 @@ def parse_tag_list(text: str) -> list[str]:
 
 
 def register_db_commands(db_group: click.Group) -> None:
+    @db_group.group("snapshot")
+    def snapshot_group() -> None:
+        """Build production snapshot artifacts (SQLite + static export)."""
+
+    @snapshot_group.command("build")
+    @click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input JSON file path")
+    @click.option("-b", "--bibtex", "bibtex_path", default=None, help="Optional BibTeX file path")
+    @click.option(
+        "--md-root",
+        "md_roots",
+        multiple=True,
+        default=(),
+        help="Optional markdown root directory (repeatable) for source viewing",
+    )
+    @click.option(
+        "--md-translated-root",
+        "md_translated_roots",
+        multiple=True,
+        default=(),
+        help="Optional markdown root directory (repeatable) for translated viewing",
+    )
+    @click.option(
+        "--pdf-root",
+        "pdf_roots",
+        multiple=True,
+        default=(),
+        help="Optional PDF root directory (repeatable) for PDF discovery",
+    )
+    @click.option("--output-db", "output_db", default="paper_snapshot.db", show_default=True, help="Output DB path")
+    @click.option(
+        "--static-export-dir",
+        "static_export_dir",
+        default="paper-static",
+        show_default=True,
+        help="Output directory for hashed static assets",
+    )
+    @click.option(
+        "--previous-snapshot-db",
+        "previous_snapshot_db",
+        default=None,
+        help="Optional previous snapshot DB path for identity continuity",
+    )
+    def snapshot_build(
+        input_paths: tuple[str, ...],
+        bibtex_path: str | None,
+        md_roots: tuple[str, ...],
+        md_translated_roots: tuple[str, ...],
+        pdf_roots: tuple[str, ...],
+        output_db: str,
+        static_export_dir: str,
+        previous_snapshot_db: str | None,
+    ) -> None:
+        """Build a production snapshot (SQLite + static export)."""
+        from deepresearch_flow.paper.snapshot.builder import SnapshotBuildOptions, build_snapshot
+
+        opts = SnapshotBuildOptions(
+            input_paths=[Path(path) for path in input_paths],
+            bibtex_path=Path(bibtex_path) if bibtex_path else None,
+            md_roots=[Path(root) for root in md_roots],
+            md_translated_roots=[Path(root) for root in md_translated_roots],
+            pdf_roots=[Path(root) for root in pdf_roots],
+            output_db=Path(output_db),
+            static_export_dir=Path(static_export_dir),
+            previous_snapshot_db=Path(previous_snapshot_db) if previous_snapshot_db else None,
+        )
+        build_snapshot(opts)
+        click.echo(f"Wrote snapshot DB: {opts.output_db}")
+        click.echo(f"Wrote static export: {opts.static_export_dir}")
+
+    @db_group.group("api")
+    def api_group() -> None:
+        """Read-only JSON API server backed by a snapshot DB."""
+
+    @api_group.command("serve")
+    @click.option("--snapshot-db", "snapshot_db", required=True, help="Path to paper_snapshot.db")
+    @click.option(
+        "--static-base-url",
+        "static_base_url",
+        default=None,
+        help="Static asset base URL (e.g. https://static.example.com)",
+    )
+    @click.option(
+        "--cors-origin",
+        "cors_origins",
+        multiple=True,
+        default=(),
+        help="Allowed CORS origin (repeatable; default is '*')",
+    )
+    @click.option("--max-query-length", "max_query_length", type=int, default=500, show_default=True)
+    @click.option("--max-page-size", "max_page_size", type=int, default=100, show_default=True)
+    @click.option("--max-pagination-offset", "max_pagination_offset", type=int, default=10000, show_default=True)
+    @click.option("--host", default="127.0.0.1", show_default=True, help="Bind host")
+    @click.option("--port", default=8001, type=int, show_default=True, help="Bind port")
+    def api_serve(
+        snapshot_db: str,
+        static_base_url: str | None,
+        cors_origins: tuple[str, ...],
+        max_query_length: int,
+        max_page_size: int,
+        max_pagination_offset: int,
+        host: str,
+        port: int,
+    ) -> None:
+        """Serve the snapshot-backed JSON API."""
+        import os
+        import uvicorn
+
+        from deepresearch_flow.paper.snapshot.api import ApiLimits, create_app
+
+        static_base_url_value = (
+            static_base_url
+            or os.getenv("PAPER_DB_STATIC_BASE")
+            or os.getenv("PAPER_DB_STATIC_BASE_URL")
+            or ""
+        )
+        api_base_url = os.getenv("PAPER_DB_API_BASE") or ""
+        if api_base_url and host == "127.0.0.1" and port == 8001:
+            from urllib.parse import urlparse
+
+            parsed = urlparse(api_base_url)
+            if not parsed.scheme:
+                parsed = urlparse(f"http://{api_base_url}")
+            if parsed.hostname:
+                host = parsed.hostname
+            if parsed.port:
+                port = parsed.port
+        cors_allowed = list(cors_origins) if cors_origins else ["*"]
+        limits = ApiLimits(
+            max_query_length=max_query_length,
+            max_page_size=max_page_size,
+            max_pagination_offset=max_pagination_offset,
+        )
+        app = create_app(
+            snapshot_db=Path(snapshot_db),
+            static_base_url=static_base_url_value,
+            cors_allowed_origins=cors_allowed,
+            limits=limits,
+        )
+        click.echo(f"Serving API on http://{host}:{port} (Ctrl+C to stop)")
+        uvicorn.run(app, host=host, port=port, log_level="info")
+
     @db_group.command("append-bibtex")
     @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
     @click.option("-b", "--bibtex", "bibtex_path", required=True, help="Input BibTeX file path")
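
Note: the snapshot and api groups are registered onto whatever click group the caller passes to register_db_commands. A hedged sketch of driving the new `snapshot build` subcommand in-process with click's test runner; the stand-in `db` group and the input path papers.json are hypothetical:

    import click
    from click.testing import CliRunner

    from deepresearch_flow.paper.db import register_db_commands

    @click.group()
    def db() -> None:
        """Stand-in for the package's real db command group."""

    register_db_commands(db)

    result = CliRunner().invoke(db, [
        "snapshot", "build",
        "-i", "papers.json",                 # hypothetical extraction output
        "--output-db", "paper_snapshot.db",
        "--static-export-dir", "paper-static",
    ])
    print(result.exit_code, result.output)
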
@@ -782,15 +1011,195 @@ def register_db_commands(db_group: click.Group) -> None:
         write_json(Path(output_path), filtered)
         click.echo(f"Filtered down to {len(filtered)} papers")
 
-    @db_group.
+    @db_group.group("merge")
+    def merge_group() -> None:
+        """Merge paper JSON inputs."""
+
+    def _summarize_merge(output_path: Path, merged: list[dict[str, Any]], *, input_count: int) -> None:
+        field_set: set[str] = set()
+        for item in merged:
+            if isinstance(item, dict):
+                field_set.update(item.keys())
+        field_list = sorted(field_set)
+
+        console = Console()
+        summary = Table(title="Merge Summary")
+        summary.add_column("Metric", style="bold")
+        summary.add_column("Value")
+        summary.add_row("Inputs", str(input_count))
+        summary.add_row("Items", str(len(merged)))
+        summary.add_row("Fields", str(len(field_list)))
+        summary.add_row("Output", str(output_path))
+        console.print(summary)
+
+        if field_list:
+            field_table = Table(title="Fields")
+            field_table.add_column("Name")
+            for name in field_list:
+                field_table.add_row(name)
+            console.print(field_table)
+
+    @merge_group.command("library")
+    @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
+    @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
+    def merge_library(input_paths: Iterable[str], output_path: str) -> None:
+        paths = [Path(path) for path in input_paths]
+        merged: list[dict[str, Any]] = []
+        for path in paths:
+            merged.extend(load_json(path))
+        output = Path(output_path)
+        write_json(output, merged)
+        _summarize_merge(output, merged, input_count=len(paths))
+
+    @merge_group.command("templates")
     @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
     @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
-    def
+    def merge_templates(input_paths: Iterable[str], output_path: str) -> None:
+        from deepresearch_flow.paper import db_ops
+
+        paths = [Path(path) for path in input_paths]
+        inputs = db_ops._load_paper_inputs(paths)
+        if not inputs:
+            raise click.ClickException("No input JSON files provided")
+
+        groups: list[dict[str, Any]] = []
+        base_papers: list[dict[str, Any]] = []
+        hash_to_group: dict[str, int] = {}
+        paper_id_to_group: dict[int, int] = {}
+        paper_index: dict[str, list[dict[str, Any]]] = {}
+
+        def rebuild_index() -> None:
+            nonlocal paper_index, paper_id_to_group
+            paper_index = db_ops._build_paper_index(base_papers)
+            paper_id_to_group = {id(paper): idx for idx, paper in enumerate(base_papers)}
+
+        def add_group(template_tag: str, paper: dict[str, Any]) -> None:
+            group = {
+                "templates": {template_tag: paper},
+                "template_order": [template_tag],
+            }
+            groups.append(group)
+            base_papers.append(paper)
+            source_hash = str(paper.get("source_hash") or "")
+            if source_hash:
+                hash_to_group[source_hash] = len(groups) - 1
+            rebuild_index()
+
+        stats: dict[str, dict[str, int]] = {}
+        diff_counts: dict[tuple[str, str], int] = {}
+        diff_samples: list[tuple[str, str, str, str, str]] = []
+        first_tag = str(inputs[0].get("template_tag") or "")
+        base_items = inputs[0].get("papers") or []
+        stats[first_tag] = {"total": len(base_items), "matched": len(base_items), "skipped": 0}
+        for paper in base_items:
+            if not isinstance(paper, dict):
+                raise click.ClickException("Input papers must be objects")
+            db_ops._prepare_paper_matching_fields(paper)
+            add_group(first_tag, paper)
+
+        for bundle in inputs[1:]:
+            template_tag = str(bundle.get("template_tag") or "")
+            items = bundle.get("papers") or []
+            matched = 0
+            skipped = 0
+            for paper in items:
+                if not isinstance(paper, dict):
+                    raise click.ClickException("Input papers must be objects")
+                db_ops._prepare_paper_matching_fields(paper)
+                source_hash = str(paper.get("source_hash") or "")
+                match_idx: int | None = None
+                if source_hash and source_hash in hash_to_group:
+                    match_idx = hash_to_group[source_hash]
+                else:
+                    match_paper, _, _ = db_ops._resolve_paper_by_title_and_meta(
+                        paper, paper_index
+                    )
+                    if match_paper is not None:
+                        match_idx = paper_id_to_group.get(id(match_paper))
+                if match_idx is None:
+                    skipped += 1
+                    continue
+                matched += 1
+                group = groups[match_idx]
+                base_templates = group.get("templates") or {}
+                base_paper = base_templates.get(first_tag)
+                if isinstance(base_paper, dict):
+                    for field in ("source_hash", "paper_title", "publication_date"):
+                        base_value = str(base_paper.get(field) or "")
+                        other_value = str(paper.get(field) or "")
+                        if base_value == other_value:
+                            continue
+                        diff_counts[(template_tag, field)] = diff_counts.get(
+                            (template_tag, field), 0
+                        ) + 1
+                        if len(diff_samples) < 50:
+                            diff_samples.append(
+                                (
+                                    template_tag,
+                                    field,
+                                    str(base_paper.get("paper_title") or ""),
+                                    base_value,
+                                    other_value,
+                                )
+                            )
+                templates = group.setdefault("templates", {})
+                templates[template_tag] = paper
+                order = group.setdefault("template_order", [])
+                if template_tag not in order:
+                    order.append(template_tag)
+            stats[template_tag] = {"total": len(items), "matched": matched, "skipped": skipped}
+
         merged: list[dict[str, Any]] = []
-        for
-
-
-
+        for group in groups:
+            templates = group.get("templates") or {}
+            order = group.get("template_order") or list(templates.keys())
+            entry: dict[str, Any] = {}
+            for tag in order:
+                paper = templates.get(tag)
+                if not isinstance(paper, dict):
+                    continue
+                for key, value in paper.items():
+                    if key not in entry:
+                        entry[key] = value
+            merged.append(entry)
+
+        output = Path(output_path)
+        write_json(output, merged)
+        _summarize_merge(output, merged, input_count=len(paths))
+
+        stat_table = Table(title="Template Merge Stats")
+        stat_table.add_column("Template")
+        stat_table.add_column("Total", justify="right")
+        stat_table.add_column("Matched", justify="right")
+        stat_table.add_column("Skipped", justify="right")
+        for tag, values in stats.items():
+            stat_table.add_row(
+                tag or "(unknown)",
+                str(values.get("total", 0)),
+                str(values.get("matched", 0)),
+                str(values.get("skipped", 0)),
+            )
+        Console().print(stat_table)
+
+        if diff_counts:
+            diff_table = Table(title="Template Field Diff Summary")
+            diff_table.add_column("Template")
+            diff_table.add_column("Field")
+            diff_table.add_column("Count", justify="right")
+            for (template_tag, field), count in sorted(diff_counts.items()):
+                diff_table.add_row(template_tag or "(unknown)", field, str(count))
+            Console().print(diff_table)
+
+        if diff_samples:
+            sample_table = Table(title="Template Field Diff Samples (up to 50)")
+            sample_table.add_column("Template")
+            sample_table.add_column("Field")
+            sample_table.add_column("Base Title")
+            sample_table.add_column("Base Value")
+            sample_table.add_column("Other Value")
+            for row in diff_samples:
+                sample_table.add_row(*row)
+            Console().print(sample_table)
 
     @db_group.command("render-md")
     @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
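
Note: the flattening step at the end of merge_templates is first-writer-wins per field: template_order decides precedence, and later templates only fill keys the earlier ones left unset. The rule isolated as a self-contained sketch (the sample data is invented):

    templates = {
        "deep_read": {"paper_title": "T", "summary": "long"},
        "eight_questions": {"paper_title": "T (alt)", "questions": ["q1"]},
    }
    order = ["deep_read", "eight_questions"]

    entry: dict = {}
    for tag in order:
        for key, value in templates[tag].items():
            if key not in entry:  # first writer wins
                entry[key] = value

    print(entry)  # {'paper_title': 'T', 'summary': 'long', 'questions': ['q1']}
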
@@ -843,6 +1252,614 @@ def register_db_commands(db_group: click.Group) -> None:
         rendered = render_papers(papers, out_dir, template, output_language)
         click.echo(f"Rendered {rendered} markdown files")
 
+    @db_group.command("extract")
+    @click.option("--json", "target_json", default=None, help="Target JSON database path")
+    @click.option("--input-json", "input_json", default=None, help="Reference JSON file path")
+    @click.option(
+        "--pdf-root", "pdf_roots", multiple=True, help="PDF root directories for reference (repeatable)"
+    )
+    @click.option(
+        "--md-root", "md_roots", multiple=True, help="Markdown root directories for reference (repeatable)"
+    )
+    @click.option(
+        "--md-translated-root", "md_translated_roots", multiple=True,
+        help="Translated Markdown root directories to extract from (repeatable)"
+    )
+    @click.option(
+        "--md-source-root", "md_source_roots", multiple=True,
+        help="Source Markdown root directories to extract from (repeatable)"
+    )
+    @click.option("--output-json", "output_json", default=None, help="Output JSON file path")
+    @click.option(
+        "--output-md-translated-root",
+        "output_md_translated_root",
+        default=None,
+        help="Output directory for matched translated Markdown",
+    )
+    @click.option(
+        "--output-md-root",
+        "output_md_root",
+        default=None,
+        help="Output directory for matched source Markdown",
+    )
+    @click.option(
+        "-b",
+        "--input-bibtex",
+        "input_bibtex",
+        default=None,
+        help="Reference BibTeX file path",
+    )
+    @click.option("--lang", "lang", default=None, help="Language code for translated Markdown (e.g., zh)")
+    @click.option("--output-csv", "output_csv", default=None, help="Path to export results as CSV")
+    def extract(
+        target_json: str | None,
+        input_json: str | None,
+        pdf_roots: tuple[str, ...],
+        md_roots: tuple[str, ...],
+        md_translated_roots: tuple[str, ...],
+        md_source_roots: tuple[str, ...],
+        output_json: str | None,
+        output_md_translated_root: str | None,
+        output_md_root: str | None,
+        input_bibtex: str | None,
+        lang: str | None,
+        output_csv: str | None,
+    ) -> None:
+        from deepresearch_flow.paper import db_ops
+        from deepresearch_flow.paper.utils import stable_hash
+
+        if input_json and input_bibtex:
+            raise click.ClickException("Use only one of --input-json or --input-bibtex")
+
+        if target_json is None and input_json is not None:
+            target_json = input_json
+
+        has_reference = bool(pdf_roots or md_roots or input_json or input_bibtex)
+        if not has_reference:
+            raise click.ClickException(
+                "Provide at least one reference input: --pdf-root, --md-root, --input-json, or --input-bibtex"
+            )
+        if not target_json and not md_translated_roots and not md_source_roots:
+            raise click.ClickException(
+                "Provide --json and/or --md-translated-root and/or --md-source-root"
+            )
+        if target_json and not output_json:
+            raise click.ClickException("--output-json is required when using --json")
+        if output_json and not target_json:
+            raise click.ClickException("--json is required when using --output-json")
+        if md_translated_roots and not output_md_translated_root:
+            raise click.ClickException(
+                "--output-md-translated-root is required when using --md-translated-root"
+            )
+        if output_md_translated_root and not md_translated_roots:
+            raise click.ClickException(
+                "--md-translated-root is required when using --output-md-translated-root"
+            )
+        if md_source_roots and not output_md_root:
+            raise click.ClickException("--output-md-root is required when using --md-source-root")
+        if output_md_root and not md_source_roots:
+            raise click.ClickException("--md-source-root is required when using --output-md-root")
+        if md_translated_roots and not lang:
+            raise click.ClickException("--lang is required when extracting translated Markdown")
+
+        pdf_root_paths = [Path(path) for path in pdf_roots]
+        md_root_paths = [Path(path) for path in md_roots]
+        translated_root_paths = [Path(path) for path in md_translated_roots]
+        source_root_paths = [Path(path) for path in md_source_roots]
+        reference_json_path = Path(input_json) if input_json else None
+        reference_bibtex_path = Path(input_bibtex) if input_bibtex else None
+
+        reference_papers: list[dict[str, Any]] = []
+        if reference_json_path:
+            if not reference_json_path.is_file():
+                raise click.ClickException(f"Reference JSON not found: {reference_json_path}")
+            reference_papers, _ = load_json_payload(reference_json_path)
+        if reference_bibtex_path:
+            if not reference_bibtex_path.is_file():
+                raise click.ClickException(f"Reference BibTeX not found: {reference_bibtex_path}")
+            if not db_ops.PYBTEX_AVAILABLE:
+                raise click.ClickException("pybtex is required for --input-bibtex support")
+            bib_data = db_ops.parse_file(str(reference_bibtex_path))
+            for key, entry in bib_data.entries.items():
+                title = entry.fields.get("title")
+                if not title:
+                    continue
+                year = entry.fields.get("year") or ""
+                year = str(year) if str(year).isdigit() else ""
+                authors = []
+                for person in entry.persons.get("author", []):
+                    authors.append(str(person))
+                reference_papers.append(
+                    {
+                        "paper_title": str(title),
+                        "paper_authors": authors,
+                        "publication_date": year,
+                        "source_path": f"bibtex:{key}",
+                    }
+                )
+
+        reference_index: dict[str, list[dict[str, Any]]] = {}
+        for paper in reference_papers:
+            if "source_path" not in paper and reference_json_path:
+                paper["source_path"] = str(reference_json_path)
+            db_ops._prepare_paper_matching_fields(paper)
+        if reference_papers:
+            reference_index = db_ops._build_paper_index(reference_papers)
+
+        all_results: list[Any] = []
+
+        if target_json:
+            target_json_path = Path(target_json)
+            if not target_json_path.is_file():
+                raise click.ClickException(f"Target JSON not found: {target_json_path}")
+            papers, payload = load_json_payload(target_json_path)
+
+            results: list[Any] = []
+            matched_indices: set[int]
+            if pdf_root_paths or md_root_paths:
+                results, match_pairs, _, _ = db_ops.compare_datasets_with_pairs(
+                    json_paths_a=[target_json_path],
+                    pdf_roots_b=pdf_root_paths,
+                    md_roots_b=md_root_paths,
+                    bibtex_path=None,
+                    lang=None,
+                    show_progress=True,
+                )
+                matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
+                all_results.extend(results)
+            else:
+                matched_indices = set(range(len(papers)))
+
+            matched_reference_ids: set[int] = set()
+            if reference_index:
+                def detail_score(paper: dict[str, Any]) -> tuple[int, int]:
+                    non_empty = 0
+                    total_len = 0
+                    for value in paper.values():
+                        if value is None:
+                            continue
+                        if isinstance(value, (list, dict)):
+                            if value:
+                                non_empty += 1
+                                total_len += len(
+                                    json.dumps(value, ensure_ascii=False, sort_keys=True)
+                                )
+                        else:
+                            text = str(value).strip()
+                            if text:
+                                non_empty += 1
+                                total_len += len(text)
+                    return non_empty, total_len
+
+                def resolve_reference_match(
+                    paper: dict[str, Any],
+                ) -> tuple[dict[str, Any] | None, str | None, float]:
+                    match_paper, match_type, match_score = db_ops._resolve_paper_by_title_and_meta(
+                        paper, reference_index
+                    )
+                    if match_paper is not None:
+                        return match_paper, match_type, match_score
+                    year = str(paper.get("_year") or "").strip()
+                    if not year.isdigit():
+                        return None, None, 0.0
+                    authors = paper.get("_authors") or []
+                    author_key = ""
+                    if authors:
+                        author_key = db_ops._normalize_author_key(str(authors[0]))
+                    candidates: list[dict[str, Any]] = []
+                    fallback_type = "year_relaxed"
+                    if author_key:
+                        candidates = reference_index.get(f"authoryear:{year}:{author_key}", [])
+                        if candidates:
+                            fallback_type = "author_year_relaxed"
+                    if not candidates:
+                        candidates = reference_index.get(f"year:{year}", [])
+                    if not candidates:
+                        return None, None, 0.0
+                    title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
+                    match, score = db_ops._adaptive_similarity_match_papers(title_key, candidates)
+                    if match is None:
+                        return candidates[0], fallback_type, 0.0
+                    return match, fallback_type, score
+
+                base_indices = set(matched_indices)
+                best_matches: dict[int, tuple[int, tuple[int, int], str | None, float]] = {}
+                for idx, paper in enumerate(papers):
+                    if idx not in matched_indices:
+                        continue
+                    db_ops._prepare_paper_matching_fields(paper)
+                    match_paper, match_type, match_score = resolve_reference_match(paper)
+                    if match_paper is None:
+                        continue
+                    ref_id = id(match_paper)
+                    score = detail_score(paper)
+                    current = best_matches.get(ref_id)
+                    if current is None:
+                        best_matches[ref_id] = (idx, score, match_type, match_score)
+                        continue
+                    if score > current[1] or (score == current[1] and match_score > current[3]):
+                        best_matches[ref_id] = (idx, score, match_type, match_score)
+
+                matched_reference_ids = set(best_matches.keys())
+                matched_indices = {idx for idx, *_ in best_matches.values()}
+
+            matched_papers = [paper for idx, paper in enumerate(papers) if idx in matched_indices]
+            deduped_papers: list[Any] = []
+            seen_titles: set[str] = set()
+            for paper in matched_papers:
+                title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
+                if title_key:
+                    if title_key in seen_titles:
+                        continue
+                    seen_titles.add(title_key)
+                deduped_papers.append(paper)
+            if len(deduped_papers) != len(matched_papers):
+                removed = len(matched_papers) - len(deduped_papers)
+                click.echo(f"Deduplicated {removed} entries by normalized title.")
+            matched_papers = deduped_papers
+            output_path = Path(output_json) if output_json else None
+            if output_path is None:
+                raise click.ClickException("--output-json is required when using --json")
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            if payload is None:
+                write_json(output_path, matched_papers)
+            else:
+                output_payload = dict(payload)
+                output_payload["papers"] = matched_papers
+                write_json(output_path, output_payload)
+            click.echo(f"Extracted {len(matched_papers)} JSON entries to {output_path}")
+
+            if output_csv and reference_papers:
+                match_meta_by_ref_id = {
+                    ref_id: (idx, match_type, match_score)
+                    for ref_id, (idx, _, match_type, match_score) in best_matches.items()
+                }
+                for ref in reference_papers:
+                    ref_id = id(ref)
+                    ref_title = str(ref.get("paper_title") or "")
+                    ref_hash = stable_hash(str(ref_title or ref.get("source_path") or ""))
+                    ref_path = str(ref.get("source_path") or "")
+                    if ref_id in match_meta_by_ref_id:
+                        idx, match_type, match_score = match_meta_by_ref_id[ref_id]
+                        paper = papers[idx]
+                        paper_hash = str(paper.get("source_hash") or "") or stable_hash(
+                            str(paper.get("paper_title") or "")
+                        )
+                        all_results.append(
+                            db_ops.CompareResult(
+                                side="MATCH",
+                                source_hash=ref_hash,
+                                title=ref_title,
+                                match_status="matched_pair",
+                                match_type=match_type,
+                                match_score=match_score,
+                                source_path=ref_path,
+                                other_source_hash=paper_hash,
+                                other_title=str(paper.get("paper_title") or ""),
+                                other_source_path=str(paper.get("source_path") or ""),
+                                lang=None,
+                            )
+                        )
+                        continue
+                    all_results.append(
+                        db_ops.CompareResult(
+                            side="B",
+                            source_hash=ref_hash,
+                            title=ref_title,
+                            match_status="only_in_B",
+                            match_type=None,
+                            match_score=0.0,
+                            source_path=ref_path,
+                            other_source_hash=None,
+                            other_title=None,
+                            other_source_path=None,
+                            lang=None,
+                        )
+                    )
+
+                for idx in sorted(base_indices - matched_indices):
+                    paper = papers[idx]
+                    paper_title = str(paper.get("paper_title") or "")
+                    paper_hash = str(paper.get("source_hash") or "") or stable_hash(paper_title)
+                    all_results.append(
+                        db_ops.CompareResult(
+                            side="A",
+                            source_hash=paper_hash,
+                            title=paper_title,
+                            match_status="only_in_A",
+                            match_type=None,
+                            match_score=0.0,
+                            source_path=str(paper.get("source_path") or ""),
+                            other_source_hash=None,
+                            other_title=None,
+                            other_source_path=None,
+                            lang=None,
+                        )
+                    )
+
+        copied_count = 0
+        if md_translated_roots:
+            output_root = Path(output_md_translated_root) if output_md_translated_root else None
+            if output_root is None:
+                raise click.ClickException(
+                    "--output-md-translated-root is required when using --md-translated-root"
+                )
+            results, match_pairs, dataset_a, _ = compare_datasets_with_pairs(
+                md_translated_roots_a=translated_root_paths,
+                pdf_roots_b=pdf_root_paths,
+                md_roots_b=md_root_paths,
+                lang=lang,
+                show_progress=True,
+            )
+            matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
+            copy_iter = tqdm(
+                enumerate(dataset_a.papers),
+                total=len(dataset_a.papers),
+                desc="copy translated",
+                unit="file",
+            )
+            for idx, paper in copy_iter:
+                if idx not in matched_indices:
+                    continue
+                source_path = paper.get("source_path")
+                if not source_path:
+                    continue
+                source = Path(str(source_path))
+                relative = resolve_relative_path(source, translated_root_paths)
+                destination = output_root / relative
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy2(source, destination)
+                copied_count += 1
+            click.echo(
+                f"Copied {copied_count} translated Markdown files to {output_root}"
+            )
+            all_results.extend(results)
+
+        if md_source_roots:
+            output_root = Path(output_md_root) if output_md_root else None
+            if output_root is None:
+                raise click.ClickException("--output-md-root is required when using --md-source-root")
+            results, match_pairs, dataset_a, _ = compare_datasets_with_pairs(
+                md_roots_a=source_root_paths,
+                pdf_roots_b=pdf_root_paths,
+                md_roots_b=md_root_paths,
+                lang=None,
+                show_progress=True,
+            )
+            matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
+            copied_source = 0
+            copy_iter = tqdm(
+                enumerate(dataset_a.papers),
+                total=len(dataset_a.papers),
+                desc="copy source",
+                unit="file",
+            )
+            for idx, paper in copy_iter:
+                if idx not in matched_indices:
+                    continue
+                source_path = paper.get("source_path")
+                if not source_path:
+                    continue
+                source = Path(str(source_path))
+                relative = resolve_relative_path(source, source_root_paths)
+                destination = output_root / relative
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy2(source, destination)
+                copied_source += 1
+            click.echo(f"Copied {copied_source} source Markdown files to {output_root}")
+            copied_count += copied_source
+            all_results.extend(results)
+
+        if output_csv:
+            output_path = Path(output_csv)
+            export_compare_csv(all_results, output_path)
+            click.echo(f"Results exported to: {output_path}")
+
+    @db_group.command("verify")
+    @click.option("--input-json", "input_json", required=True, help="Input JSON file path")
+    @click.option(
+        "--output-json",
+        "output_json",
+        required=True,
+        help="Output verification report JSON path",
+    )
+    @click.option(
+        "--prompt-template",
+        "prompt_template",
+        default=None,
+        type=click.Choice(list_template_names()),
+        help="Prompt template to load schema (e.g., deep_read)",
+    )
+    @click.option(
+        "-s",
+        "--schema-json",
+        "--schema",
+        "schema_json",
+        default=None,
+        help="Custom schema JSON path",
+    )
+    @click.option(
+        "--ignore-field",
+        "ignore_fields",
+        multiple=True,
+        help="Schema field to ignore when checking empties (repeatable)",
+    )
+    def verify(
+        input_json: str,
+        output_json: str,
+        prompt_template: str | None,
+        schema_json: str | None,
+        ignore_fields: tuple[str, ...],
+    ) -> None:
+        if prompt_template and schema_json:
+            raise click.ClickException("Use only one of --prompt-template or --schema-json")
+        if not prompt_template and not schema_json:
+            raise click.ClickException("Provide --prompt-template or --schema-json")
+
+        input_path = Path(input_json)
+        if not input_path.is_file():
+            raise click.ClickException(f"Input JSON not found: {input_path}")
+
+        papers, payload = load_json_payload(input_path)
+        template_tag = (
+            prompt_template
+            or (payload.get("template_tag") if isinstance(payload, dict) else None)
+            or "custom"
+        )
+
+        try:
+            if schema_json:
+                schema = load_schema(schema_json)
+            else:
+                schema = load_schema_for_template(prompt_template or template_tag)
+        except SchemaError as exc:
+            raise click.ClickException(str(exc)) from exc
+        except ValueError as exc:
+            raise click.ClickException(str(exc)) from exc
+
+        ignore_set = {field.strip() for field in ignore_fields if field.strip()}
+        properties = schema.get("properties", {})
+        schema_fields = sorted(
+            field
+            for field in (set(properties.keys()) | set(schema.get("required", [])))
+            if field not in ignore_set
+        )
+        if not schema_fields:
+            raise click.ClickException("Schema does not define any properties")
+
+        stage_defs = get_stage_definitions(prompt_template or template_tag)
+        field_stage_map: dict[str, str] = {}
+        for stage_def in stage_defs:
+            for field in stage_def.fields:
+                if field in ignore_set:
+                    continue
+                field_stage_map.setdefault(field, stage_def.name)
+
+        report_items: list[dict[str, Any]] = []
+        for paper in papers:
+            if not isinstance(paper, dict):
+                continue
+            missing_fields = [
+                field
+                for field in schema_fields
+                if field not in paper or is_empty_value(paper.get(field))
+            ]
+            if not missing_fields:
+                continue
+            item: dict[str, Any] = {
+                "source_path": str(paper.get("source_path") or ""),
+                "paper_title": str(paper.get("paper_title") or ""),
+                "missing_fields": missing_fields,
+            }
+            if field_stage_map and all(field in field_stage_map for field in missing_fields):
+                item["retry_stages"] = sorted(
+                    {field_stage_map[field] for field in missing_fields}
+                )
+            report_items.append(item)
+
+        report_payload = {
+            "template_tag": template_tag,
+            "schema_fields": schema_fields,
+            "items": report_items,
+        }
+
+        output_path = Path(output_json)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        write_json(output_path, report_payload)
+
+        console = Console()
+        total_missing = sum(len(item["missing_fields"]) for item in report_items)
+        summary_table = Table(title="db verify summary")
+        summary_table.add_column("Metric", style="cyan")
+        summary_table.add_column("Value", style="white", overflow="fold")
+        summary_table.add_row("Input", str(input_path))
+        summary_table.add_row("Template", template_tag)
+        summary_table.add_row("Items", str(len(papers)))
+        summary_table.add_row("Items with missing fields", str(len(report_items)))
+        summary_table.add_row("Total missing fields", str(total_missing))
+        if ignore_set:
+            summary_table.add_row("Ignored fields", ", ".join(sorted(ignore_set)))
+        summary_table.add_row("Output", str(output_path))
+        console.print(summary_table)
+
+        if report_items:
+            field_counts: dict[str, int] = {field: 0 for field in schema_fields}
+            for item in report_items:
+                for field in item["missing_fields"]:
+                    field_counts[field] = field_counts.get(field, 0) + 1
+
+            count_table = Table(title="Missing field counts")
+            count_table.add_column("Field", style="cyan")
+            count_table.add_column("Missing", style="yellow", justify="right")
+            for field, count in sorted(field_counts.items(), key=lambda x: (-x[1], x[0])):
+                if count:
+                    count_table.add_row(field, str(count))
+            console.print(count_table)
+
+            detail_table = Table(title="Missing field details")
+            detail_table.add_column("#", style="dim", justify="right")
+            detail_table.add_column("Title", style="white", overflow="fold")
+            detail_table.add_column("Source Path", style="cyan", overflow="fold")
+            detail_table.add_column("Missing Fields", style="yellow", overflow="fold")
+            detail_table.add_column("Retry Stages", style="green", overflow="fold")
+            for idx, item in enumerate(report_items, start=1):
+                retry_stages = item.get("retry_stages") or []
+                detail_table.add_row(
+                    str(idx),
+                    item.get("paper_title") or "",
+                    item.get("source_path") or "",
+                    ", ".join(item.get("missing_fields", [])),
+                    ", ".join(retry_stages),
+                )
+            console.print(detail_table)
+        else:
+            console.print(Panel("[green]No missing fields detected.[/green]", expand=False))
+
+    @db_group.command("transfer-pdfs")
+    @click.option("--input-list", "input_list", required=True, help="Text file containing PDF paths")
+    @click.option("--output-dir", "output_dir", required=True, help="Output directory")
+    @click.option("--move", "move_files", is_flag=True, help="Move PDFs instead of copying")
+    @click.option("--copy", "copy_files", is_flag=True, help="Copy PDFs instead of moving")
+    def transfer_pdfs(
+        input_list: str,
+        output_dir: str,
+        move_files: bool,
+        copy_files: bool,
+    ) -> None:
+        if move_files == copy_files:
+            raise click.ClickException("Specify exactly one of --move or --copy")
+
+        list_path = Path(input_list)
+        if not list_path.is_file():
+            raise click.ClickException(f"Input list not found: {list_path}")
+
+        destination_root = Path(output_dir)
+        destination_root.mkdir(parents=True, exist_ok=True)
+
+        entries = [line.strip() for line in list_path.read_text(encoding="utf-8").splitlines()]
+        entries = [line for line in entries if line]
+
+        processed = 0
+        missing = 0
+        transfer_iter = tqdm(entries, total=len(entries), desc="transfer pdfs", unit="file")
+        for raw in transfer_iter:
+            source = Path(raw).expanduser()
+            if not source.is_file():
+                missing += 1
+                continue
+            destination = destination_root / source.name
+            destination.parent.mkdir(parents=True, exist_ok=True)
+            if move_files:
+                shutil.move(str(source), str(destination))
+            else:
+                shutil.copy2(source, destination)
+            processed += 1
+
+        action = "Moved" if move_files else "Copied"
+        click.echo(f"{action} {processed} PDFs to {destination_root}")
+        if missing:
+            click.echo(f"Skipped {missing} missing paths")
+
     @db_group.command("compare")
     @click.option(
         "-ia", "--input-a", "input_paths_a", multiple=True, help="Input JSON files for side A (repeatable)"
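
Note: `db verify` checks every paper against the schema's declared fields and, when the template defines extraction stages, maps each missing field back to a retry stage. A hedged in-process invocation sketch; the stand-in group, the file names, the ignored field, and the choice of the deep_read template are assumptions:

    import click
    from click.testing import CliRunner

    from deepresearch_flow.paper.db import register_db_commands

    @click.group()
    def db() -> None:
        """Stand-in for the package's real db command group."""

    register_db_commands(db)

    result = CliRunner().invoke(db, [
        "verify",
        "--input-json", "deep_read.json",      # hypothetical extraction output
        "--output-json", "verify_report.json",
        "--prompt-template", "deep_read",
        "--ignore-field", "notes",             # hypothetical field to skip
    ])
    print(result.output)
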
@@ -875,6 +1892,12 @@ def register_db_commands(db_group: click.Group) -> None:
     @click.option(
         "--output-csv", "output_csv", default=None, help="Path to export results as CSV"
     )
+    @click.option(
+        "--output-only-in-b",
+        "output_only_in_b",
+        default=None,
+        help="Path to export only-in-B source paths as a newline list",
+    )
     @click.option(
         "--sample-limit", "sample_limit", default=5, type=int, show_default=True,
         help="Number of sample items to show in terminal output"
@@ -891,12 +1914,12 @@ def register_db_commands(db_group: click.Group) -> None:
         bibtex_path: str | None,
         lang: str | None,
         output_csv: str | None,
+        output_only_in_b: str | None,
         sample_limit: int,
     ) -> None:
         """Compare two datasets and report matches and differences."""
         from deepresearch_flow.paper.db_ops import compare_datasets
-
-
+
         # Validate that at least one input is provided for each side
         has_input_a = bool(input_paths_a or pdf_roots_a or md_roots_a or md_translated_roots_a)
         has_input_b = bool(input_paths_b or pdf_roots_b or md_roots_b or md_translated_roots_b)
@@ -925,6 +1948,7 @@ def register_db_commands(db_group: click.Group) -> None:
                 md_translated_roots_b=[Path(p) for p in md_translated_roots_b],
                 bibtex_path=Path(bibtex_path) if bibtex_path else None,
                 lang=lang,
+                show_progress=True,
             )
         except ValueError as exc:
             raise click.ClickException(str(exc)) from exc
@@ -998,31 +2022,14 @@ def register_db_commands(db_group: click.Group) -> None:
         # Export to CSV if requested
         if output_csv:
             output_path = Path(output_csv)
-
-
-            with open(output_path, "w", newline="", encoding="utf-8") as f:
-                writer = csv.writer(f)
-                writer.writerow([
-                    "Side", "Source Hash", "Title", "Match Status", "Match Type",
-                    "Match Score", "Source Path", "Other Source Hash", "Other Title",
-                    "Other Source Path", "Lang"
-                ])
-                for r in results:
-                    writer.writerow([
-                        r.side,
-                        r.source_hash,
-                        r.title,
-                        r.match_status,
-                        r.match_type or "",
-                        f"{r.match_score:.4f}",
-                        r.source_path or "",
-                        r.other_source_hash or "",
-                        r.other_title or "",
-                        r.other_source_path or "",
-                        r.lang or "",
-                    ])
-
+            export_compare_csv(results, output_path)
             console.print(f"\n[green]Results exported to: {output_path}[/green]")
-
+        if output_only_in_b:
+            output_path = Path(output_only_in_b)
+            count = export_only_in_b_paths(results, output_path)
+            console.print(
+                f"\n[green]Only-in-B list exported ({count} items): {output_path}[/green]"
+            )
+
         # Print final counts
         console.print(f"\nTotal results: {len(results)}")