deepresearch_flow-0.5.1-py3-none-any.whl → deepresearch_flow-0.6.1-py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1154 -35
- deepresearch_flow/paper/db_ops.py +124 -19
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +1 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +29 -7
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
- deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/db.py
CHANGED
```diff
@@ -5,10 +5,13 @@ from __future__ import annotations
 import asyncio
 import json
 import re
+import shutil
 from pathlib import Path
 from typing import Any, Iterable
 import difflib
 
+from tqdm import tqdm
+
 import click
 import httpx
 from rich.console import Console
```
```diff
@@ -19,11 +22,17 @@ from deepresearch_flow.paper.config import load_config, resolve_api_keys
 from deepresearch_flow.paper.extract import parse_model_ref
 from deepresearch_flow.paper.llm import backoff_delay, call_provider
 from deepresearch_flow.paper.providers.base import ProviderError
-from deepresearch_flow.paper.
+from deepresearch_flow.paper.schema import SchemaError, load_schema
+from deepresearch_flow.paper.template_registry import (
+    get_stage_definitions,
+    list_template_names,
+    load_schema_for_template,
+)
 from deepresearch_flow.paper.render import resolve_render_template, render_papers
 
 try:
-    from pybtex.database import parse_file
+    from pybtex.database import BibliographyData, parse_file
+    from pybtex.database.output.bibtex import Writer
     PYBTEX_AVAILABLE = True
 except ImportError:
     PYBTEX_AVAILABLE = False
```
```diff
@@ -42,6 +51,74 @@ def write_json(path: Path, data: Any) -> None:
     path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
 
 
+def load_json_payload(path: Path) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise click.ClickException(f"Invalid JSON in {path}: {exc}") from exc
+
+    if isinstance(data, list):
+        return data, None
+    if isinstance(data, dict):
+        papers = data.get("papers")
+        if isinstance(papers, list):
+            return papers, data
+        raise click.ClickException(f"JSON object missing 'papers' list: {path}")
+
+    raise click.ClickException(f"Unsupported JSON structure in {path}")
+
+
+def is_empty_value(value: Any) -> bool:
+    if value is None:
+        return True
+    if isinstance(value, str):
+        return value.strip() == ""
+    if isinstance(value, list) or isinstance(value, dict):
+        return len(value) == 0
+    return False
+
+
+def export_compare_csv(results: list[Any], output_path: Path) -> None:
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    import csv
+
+    with open(output_path, "w", newline="", encoding="utf-8") as handle:
+        writer = csv.writer(handle)
+        writer.writerow([
+            "Side", "Source Hash", "Title", "Match Status", "Match Type",
+            "Match Score", "Source Path", "Other Source Hash", "Other Title",
+            "Other Source Path", "Lang"
+        ])
+        for result in results:
+            writer.writerow([
+                result.side,
+                result.source_hash,
+                result.title,
+                result.match_status,
+                result.match_type or "",
+                f"{result.match_score:.4f}",
+                result.source_path or "",
+                result.other_source_hash or "",
+                result.other_title or "",
+                result.other_source_path or "",
+                result.lang or "",
+            ])
+
+
+def export_only_in_b_paths(results: list[Any], output_path: Path) -> int:
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    lines = []
+    for result in results:
+        if result.side != "B" or result.match_status != "only_in_B":
+            continue
+        if result.source_path:
+            lines.append(result.source_path)
+
+    output_path.write_text("\n".join(lines) + ("\n" if lines else ""), encoding="utf-8")
+    return len(lines)
+
+
 def normalize_authors(value: Any) -> list[str]:
     if value is None:
         return []
```
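The new `load_json_payload` helper accepts either a bare list of papers or a `{template_tag, papers}` bundle and returns the paper list plus the wrapper dict (or `None`). A minimal sketch of the two shapes, assuming the helper stays importable from `deepresearch_flow.paper.db` and using hypothetical file names:

```python
import json
from pathlib import Path

from deepresearch_flow.paper.db import load_json_payload  # helper added in 0.6.1

# Hypothetical inputs: a bare list and a {template_tag, papers} bundle.
Path("bare.json").write_text(json.dumps([{"paper_title": "A"}]), encoding="utf-8")
Path("bundle.json").write_text(
    json.dumps({"template_tag": "deep_read", "papers": [{"paper_title": "A"}]}),
    encoding="utf-8",
)

papers, payload = load_json_payload(Path("bare.json"))    # payload is None for a bare list
papers, payload = load_json_payload(Path("bundle.json"))  # payload keeps the wrapper dict
```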
```diff
@@ -133,6 +210,18 @@ def parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
     return year, None
 
 
+def resolve_relative_path(path: Path, roots: Iterable[Path]) -> Path:
+    resolved = path.resolve()
+    roots_by_depth = sorted(roots, key=lambda r: len(str(r.resolve())), reverse=True)
+    for root in roots_by_depth:
+        root_resolved = root.resolve()
+        try:
+            return resolved.relative_to(root_resolved)
+        except ValueError:
+            continue
+    return Path(path.name)
+
+
 def clean_journal_name(name: str | None) -> str:
     if not name:
         return "Unknown"
```
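`resolve_relative_path` tries the deepest matching root first and falls back to the bare file name. A small sketch of that behaviour, with hypothetical directories:

```python
from pathlib import Path

from deepresearch_flow.paper.db import resolve_relative_path  # added in 0.6.1

roots = [Path("/data/md"), Path("/data/md/translated")]

# The longest (deepest) root containing the path wins, so only the part
# below /data/md/translated is kept.
print(resolve_relative_path(Path("/data/md/translated/2024/paper.md"), roots))  # 2024/paper.md

# A path outside every root degrades to its file name.
print(resolve_relative_path(Path("/tmp/other.md"), roots))  # other.md
```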
```diff
@@ -266,6 +355,147 @@ def parse_tag_list(text: str) -> list[str]:
 
 
 def register_db_commands(db_group: click.Group) -> None:
+    @db_group.group("snapshot")
+    def snapshot_group() -> None:
+        """Build production snapshot artifacts (SQLite + static export)."""
+
+    @snapshot_group.command("build")
+    @click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input JSON file path")
+    @click.option("-b", "--bibtex", "bibtex_path", default=None, help="Optional BibTeX file path")
+    @click.option(
+        "--md-root",
+        "md_roots",
+        multiple=True,
+        default=(),
+        help="Optional markdown root directory (repeatable) for source viewing",
+    )
+    @click.option(
+        "--md-translated-root",
+        "md_translated_roots",
+        multiple=True,
+        default=(),
+        help="Optional markdown root directory (repeatable) for translated viewing",
+    )
+    @click.option(
+        "--pdf-root",
+        "pdf_roots",
+        multiple=True,
+        default=(),
+        help="Optional PDF root directory (repeatable) for PDF discovery",
+    )
+    @click.option("--output-db", "output_db", default="paper_snapshot.db", show_default=True, help="Output DB path")
+    @click.option(
+        "--static-export-dir",
+        "static_export_dir",
+        default="paper-static",
+        show_default=True,
+        help="Output directory for hashed static assets",
+    )
+    @click.option(
+        "--previous-snapshot-db",
+        "previous_snapshot_db",
+        default=None,
+        help="Optional previous snapshot DB path for identity continuity",
+    )
+    def snapshot_build(
+        input_paths: tuple[str, ...],
+        bibtex_path: str | None,
+        md_roots: tuple[str, ...],
+        md_translated_roots: tuple[str, ...],
+        pdf_roots: tuple[str, ...],
+        output_db: str,
+        static_export_dir: str,
+        previous_snapshot_db: str | None,
+    ) -> None:
+        """Build a production snapshot (SQLite + static export)."""
+        from deepresearch_flow.paper.snapshot.builder import SnapshotBuildOptions, build_snapshot
+
+        opts = SnapshotBuildOptions(
+            input_paths=[Path(path) for path in input_paths],
+            bibtex_path=Path(bibtex_path) if bibtex_path else None,
+            md_roots=[Path(root) for root in md_roots],
+            md_translated_roots=[Path(root) for root in md_translated_roots],
+            pdf_roots=[Path(root) for root in pdf_roots],
+            output_db=Path(output_db),
+            static_export_dir=Path(static_export_dir),
+            previous_snapshot_db=Path(previous_snapshot_db) if previous_snapshot_db else None,
+        )
+        build_snapshot(opts)
+        click.echo(f"Wrote snapshot DB: {opts.output_db}")
+        click.echo(f"Wrote static export: {opts.static_export_dir}")
+
+    @db_group.group("api")
+    def api_group() -> None:
+        """Read-only JSON API server backed by a snapshot DB."""
+
+    @api_group.command("serve")
+    @click.option("--snapshot-db", "snapshot_db", required=True, help="Path to paper_snapshot.db")
+    @click.option(
+        "--static-base-url",
+        "static_base_url",
+        default=None,
+        help="Static asset base URL (e.g. https://static.example.com)",
+    )
+    @click.option(
+        "--cors-origin",
+        "cors_origins",
+        multiple=True,
+        default=(),
+        help="Allowed CORS origin (repeatable; default is '*')",
+    )
+    @click.option("--max-query-length", "max_query_length", type=int, default=500, show_default=True)
+    @click.option("--max-page-size", "max_page_size", type=int, default=100, show_default=True)
+    @click.option("--max-pagination-offset", "max_pagination_offset", type=int, default=10000, show_default=True)
+    @click.option("--host", default="127.0.0.1", show_default=True, help="Bind host")
+    @click.option("--port", default=8001, type=int, show_default=True, help="Bind port")
+    def api_serve(
+        snapshot_db: str,
+        static_base_url: str | None,
+        cors_origins: tuple[str, ...],
+        max_query_length: int,
+        max_page_size: int,
+        max_pagination_offset: int,
+        host: str,
+        port: int,
+    ) -> None:
+        """Serve the snapshot-backed JSON API."""
+        import os
+        import uvicorn
+
+        from deepresearch_flow.paper.snapshot.api import ApiLimits, create_app
+
+        static_base_url_value = (
+            static_base_url
+            or os.getenv("PAPER_DB_STATIC_BASE")
+            or os.getenv("PAPER_DB_STATIC_BASE_URL")
+            or ""
+        )
+        api_base_url = os.getenv("PAPER_DB_API_BASE") or ""
+        if api_base_url and host == "127.0.0.1" and port == 8001:
+            from urllib.parse import urlparse
+
+            parsed = urlparse(api_base_url)
+            if not parsed.scheme:
+                parsed = urlparse(f"http://{api_base_url}")
+            if parsed.hostname:
+                host = parsed.hostname
+            if parsed.port:
+                port = parsed.port
+        cors_allowed = list(cors_origins) if cors_origins else ["*"]
+        limits = ApiLimits(
+            max_query_length=max_query_length,
+            max_page_size=max_page_size,
+            max_pagination_offset=max_pagination_offset,
+        )
+        app = create_app(
+            snapshot_db=Path(snapshot_db),
+            static_base_url=static_base_url_value,
+            cors_allowed_origins=cors_allowed,
+            limits=limits,
+        )
+        click.echo(f"Serving API on http://{host}:{port} (Ctrl+C to stop)")
+        uvicorn.run(app, host=host, port=port, log_level="info")
+
     @db_group.command("append-bibtex")
     @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
     @click.option("-b", "--bibtex", "bibtex_path", required=True, help="Input BibTeX file path")
```
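Both new commands are thin wrappers around the snapshot package they import; a sketch of driving the same entry points directly, with hypothetical paths and the default limits shown above (`uvicorn` is only needed for serving):

```python
from pathlib import Path

from deepresearch_flow.paper.snapshot.api import ApiLimits, create_app
from deepresearch_flow.paper.snapshot.builder import SnapshotBuildOptions, build_snapshot

# Build the snapshot DB plus the hashed static export.
build_snapshot(
    SnapshotBuildOptions(
        input_paths=[Path("papers.json")],
        bibtex_path=None,
        md_roots=[],
        md_translated_roots=[],
        pdf_roots=[],
        output_db=Path("paper_snapshot.db"),
        static_export_dir=Path("paper-static"),
        previous_snapshot_db=None,
    )
)

# Serve it read-only, mirroring what `db api serve` does.
app = create_app(
    snapshot_db=Path("paper_snapshot.db"),
    static_base_url="",
    cors_allowed_origins=["*"],
    limits=ApiLimits(max_query_length=500, max_page_size=100, max_pagination_offset=10000),
)
# import uvicorn; uvicorn.run(app, host="127.0.0.1", port=8001)
```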
```diff
@@ -782,15 +1012,306 @@ def register_db_commands(db_group: click.Group) -> None:
         write_json(Path(output_path), filtered)
         click.echo(f"Filtered down to {len(filtered)} papers")
 
-    @db_group.
+    @db_group.group("merge")
+    def merge_group() -> None:
+        """Merge paper JSON inputs."""
+
+    def _summarize_merge(output_path: Path, merged: Any, *, input_count: int) -> None:
+        items: list[dict[str, Any]] = []
+        if isinstance(merged, dict):
+            raw_items = merged.get("papers")
+            if isinstance(raw_items, list):
+                items = [item for item in raw_items if isinstance(item, dict)]
+        elif isinstance(merged, list):
+            items = [item for item in merged if isinstance(item, dict)]
+
+        field_set: set[str] = set()
+        for item in items:
+            field_set.update(item.keys())
+        field_list = sorted(field_set)
+
+        console = Console()
+        summary = Table(title="Merge Summary")
+        summary.add_column("Metric", style="bold")
+        summary.add_column("Value")
+        summary.add_row("Inputs", str(input_count))
+        summary.add_row("Items", str(len(items)))
+        summary.add_row("Fields", str(len(field_list)))
+        summary.add_row("Output", str(output_path))
+        console.print(summary)
+
+        if field_list:
+            field_table = Table(title="Fields")
+            field_table.add_column("Name")
+            for name in field_list:
+                field_table.add_row(name)
+            console.print(field_table)
+
+    def _bibtex_entry_score(entry: Any) -> int:
+        fields = getattr(entry, "fields", {}) or {}
+        persons = getattr(entry, "persons", {}) or {}
+        person_count = sum(len(people) for people in persons.values())
+        return len(fields) + len(persons) + person_count
+
+    def _summarize_bibtex_merge(output_path: Path, *, input_count: int, entry_count: int, duplicate_count: int) -> None:
+        summary = Table(title="BibTeX Merge Summary")
+        summary.add_column("Metric", style="bold")
+        summary.add_column("Value")
+        summary.add_row("Inputs", str(input_count))
+        summary.add_row("Entries", str(entry_count))
+        summary.add_row("Duplicates", str(duplicate_count))
+        summary.add_row("Output", str(output_path))
+        Console().print(summary)
+
+    @merge_group.command("library")
     @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
+    @click.option("--template-tag", "template_tag", default=None, help="Template tag for merged output")
     @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
-    def
+    def merge_library(input_paths: Iterable[str], template_tag: str | None, output_path: str) -> None:
+        paths = [Path(path) for path in input_paths]
         merged: list[dict[str, Any]] = []
-
-
-
-
+        tag_candidates: list[str] = []
+        for path in paths:
+            payload = load_json(path)
+            if isinstance(payload, dict):
+                tag = str(payload.get("template_tag") or "")
+                if tag:
+                    tag_candidates.append(tag)
+                papers = payload.get("papers")
+                if isinstance(papers, list):
+                    merged.extend(papers)
+                else:
+                    raise click.ClickException("Input JSON must be a list or {template_tag, papers}")
+            elif isinstance(payload, list):
+                merged.extend(payload)
+            else:
+                raise click.ClickException("Input JSON must be a list or {template_tag, papers}")
+        if not template_tag:
+            inferred = ""
+            for paper in merged:
+                if not isinstance(paper, dict):
+                    continue
+                inferred = str(paper.get("prompt_template") or paper.get("template_tag") or "")
+                if inferred:
+                    break
+            if inferred:
+                template_tag = inferred
+        if tag_candidates and not template_tag:
+            template_tag = tag_candidates[0]
+        if not template_tag:
+            template_tag = "unknown"
+        if tag_candidates and any(tag != template_tag for tag in tag_candidates):
+            click.echo("Warning: multiple template_tag values detected in inputs; using first")
+        output = Path(output_path)
+        bundle = {"template_tag": template_tag, "papers": merged}
+        write_json(output, bundle)
+        _summarize_merge(output, bundle, input_count=len(paths))
+
+    @merge_group.command("templates")
+    @click.option("-i", "--inputs", "input_paths", multiple=True, required=True, help="Input JSON files")
+    @click.option("-o", "--output", "output_path", required=True, help="Output JSON file path")
+    def merge_templates(input_paths: Iterable[str], output_path: str) -> None:
+        from deepresearch_flow.paper import db_ops
+
+        paths = [Path(path) for path in input_paths]
+        inputs = db_ops._load_paper_inputs(paths)
+        if not inputs:
+            raise click.ClickException("No input JSON files provided")
+
+        groups: list[dict[str, Any]] = []
+        base_papers: list[dict[str, Any]] = []
+        hash_to_group: dict[str, int] = {}
+        paper_id_to_group: dict[int, int] = {}
+        paper_index: dict[str, list[dict[str, Any]]] = {}
+
+        def rebuild_index() -> None:
+            nonlocal paper_index, paper_id_to_group
+            paper_index = db_ops._build_paper_index(base_papers)
+            paper_id_to_group = {id(paper): idx for idx, paper in enumerate(base_papers)}
+
+        def add_group(template_tag: str, paper: dict[str, Any]) -> None:
+            group = {
+                "templates": {template_tag: paper},
+                "template_order": [template_tag],
+            }
+            groups.append(group)
+            base_papers.append(paper)
+            source_hash = str(paper.get("source_hash") or "")
+            if source_hash:
+                hash_to_group[source_hash] = len(groups) - 1
+            rebuild_index()
+
+        stats: dict[str, dict[str, int]] = {}
+        diff_counts: dict[tuple[str, str], int] = {}
+        diff_samples: list[tuple[str, str, str, str, str]] = []
+        first_tag = str(inputs[0].get("template_tag") or "")
+        base_items = inputs[0].get("papers") or []
+        stats[first_tag] = {"total": len(base_items), "matched": len(base_items), "skipped": 0}
+        for paper in base_items:
+            if not isinstance(paper, dict):
+                raise click.ClickException("Input papers must be objects")
+            db_ops._prepare_paper_matching_fields(paper)
+            add_group(first_tag, paper)
+
+        for bundle in inputs[1:]:
+            template_tag = str(bundle.get("template_tag") or "")
+            items = bundle.get("papers") or []
+            matched = 0
+            skipped = 0
+            for paper in items:
+                if not isinstance(paper, dict):
+                    raise click.ClickException("Input papers must be objects")
+                db_ops._prepare_paper_matching_fields(paper)
+                source_hash = str(paper.get("source_hash") or "")
+                match_idx: int | None = None
+                if source_hash and source_hash in hash_to_group:
+                    match_idx = hash_to_group[source_hash]
+                else:
+                    match_paper, _, _ = db_ops._resolve_paper_by_title_and_meta(
+                        paper, paper_index
+                    )
+                    if match_paper is not None:
+                        match_idx = paper_id_to_group.get(id(match_paper))
+                if match_idx is None:
+                    skipped += 1
+                    continue
+                matched += 1
+                group = groups[match_idx]
+                base_templates = group.get("templates") or {}
+                base_paper = base_templates.get(first_tag)
+                if isinstance(base_paper, dict):
+                    for field in ("source_hash", "paper_title", "publication_date"):
+                        base_value = str(base_paper.get(field) or "")
+                        other_value = str(paper.get(field) or "")
+                        if base_value == other_value:
+                            continue
+                        diff_counts[(template_tag, field)] = diff_counts.get(
+                            (template_tag, field), 0
+                        ) + 1
+                        if len(diff_samples) < 50:
+                            diff_samples.append(
+                                (
+                                    template_tag,
+                                    field,
+                                    str(base_paper.get("paper_title") or ""),
+                                    base_value,
+                                    other_value,
+                                )
+                            )
+                templates = group.setdefault("templates", {})
+                templates[template_tag] = paper
+                order = group.setdefault("template_order", [])
+                if template_tag not in order:
+                    order.append(template_tag)
+            stats[template_tag] = {"total": len(items), "matched": matched, "skipped": skipped}
+
+        merged: list[dict[str, Any]] = []
+        for group in groups:
+            templates = group.get("templates") or {}
+            order = group.get("template_order") or list(templates.keys())
+            entry: dict[str, Any] = {}
+            for tag in order:
+                paper = templates.get(tag)
+                if not isinstance(paper, dict):
+                    continue
+                for key, value in paper.items():
+                    if key not in entry:
+                        entry[key] = value
+            merged.append(entry)
+
+        output = Path(output_path)
+        write_json(output, merged)
+        _summarize_merge(output, merged, input_count=len(paths))
+
+        stat_table = Table(title="Template Merge Stats")
+        stat_table.add_column("Template")
+        stat_table.add_column("Total", justify="right")
+        stat_table.add_column("Matched", justify="right")
+        stat_table.add_column("Skipped", justify="right")
+        for tag, values in stats.items():
+            stat_table.add_row(
+                tag or "(unknown)",
+                str(values.get("total", 0)),
+                str(values.get("matched", 0)),
+                str(values.get("skipped", 0)),
+            )
+        Console().print(stat_table)
+
+        if diff_counts:
+            diff_table = Table(title="Template Field Diff Summary")
+            diff_table.add_column("Template")
+            diff_table.add_column("Field")
+            diff_table.add_column("Count", justify="right")
+            for (template_tag, field), count in sorted(diff_counts.items()):
+                diff_table.add_row(template_tag or "(unknown)", field, str(count))
+            Console().print(diff_table)
+
+        if diff_samples:
+            sample_table = Table(title="Template Field Diff Samples (up to 50)")
+            sample_table.add_column("Template")
+            sample_table.add_column("Field")
+            sample_table.add_column("Base Title")
+            sample_table.add_column("Base Value")
+            sample_table.add_column("Other Value")
+            for row in diff_samples:
+                sample_table.add_row(*row)
+            Console().print(sample_table)
+
+    @merge_group.command("bibtex")
+    @click.option("-i", "--input", "input_paths", multiple=True, required=True, help="Input BibTeX file paths")
+    @click.option("-o", "--output", "output_path", required=True, help="Output BibTeX file path")
+    def merge_bibtex(input_paths: Iterable[str], output_path: str) -> None:
+        if not PYBTEX_AVAILABLE:
+            raise click.ClickException("pybtex is required for merge bibtex")
+
+        paths = [Path(path) for path in input_paths]
+        if not paths:
+            raise click.ClickException("No BibTeX inputs provided")
+
+        for path in paths:
+            if not path.is_file():
+                raise click.ClickException(f"BibTeX file not found: {path}")
+
+        merged_entries: dict[str, tuple[Any, int]] = {}
+        duplicate_keys: list[str] = []
+        duplicate_seen: set[str] = set()
+
+        for path in paths:
+            bib_data = parse_file(str(path))
+            for key, entry in bib_data.entries.items():
+                score = _bibtex_entry_score(entry)
+                if key not in merged_entries:
+                    merged_entries[key] = (entry, score)
+                    continue
+                if key not in duplicate_seen:
+                    duplicate_seen.add(key)
+                    duplicate_keys.append(key)
+                _, existing_score = merged_entries[key]
+                if score > existing_score:
+                    merged_entries[key] = (entry, score)
+
+        output = Path(output_path)
+        output.parent.mkdir(parents=True, exist_ok=True)
+        out_data = BibliographyData()
+        for key, (entry, _) in merged_entries.items():
+            out_data.entries[key] = entry
+        with output.open("w", encoding="utf-8") as handle:
+            Writer().write_stream(out_data, handle)
+
+        _summarize_bibtex_merge(
+            output,
+            input_count=len(paths),
+            entry_count=len(merged_entries),
+            duplicate_count=len(duplicate_keys),
+        )
+
+        if duplicate_keys:
+            preview_limit = 20
+            preview = ", ".join(duplicate_keys[:preview_limit])
+            if len(duplicate_keys) > preview_limit:
+                preview = f"{preview}, ... (+{len(duplicate_keys) - preview_limit} more)"
+            note = "Kept entry with most fields; ties keep first input order."
+            Console().print(Panel(f"{note}\n{preview}", title=f"Duplicate keys ({len(duplicate_keys)})", style="yellow"))
 
     @db_group.command("render-md")
     @click.option("-i", "--input", "input_path", required=True, help="Input JSON file path")
```
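`merge library` accepts the same two payload shapes as `load_json_payload` and always writes a `{template_tag, papers}` bundle, inferring the tag from the inputs when `--template-tag` is omitted. A rough, simplified sketch of that data shape (not the command itself; file names are hypothetical):

```python
import json
from pathlib import Path

# Hypothetical inputs: one bare list, one tagged bundle.
Path("a.json").write_text(json.dumps([{"paper_title": "A"}]), encoding="utf-8")
Path("b.json").write_text(
    json.dumps({"template_tag": "deep_read", "papers": [{"paper_title": "B"}]}),
    encoding="utf-8",
)

merged, tag = [], None
for name in ("a.json", "b.json"):
    payload = json.loads(Path(name).read_text(encoding="utf-8"))
    papers = payload["papers"] if isinstance(payload, dict) else payload
    merged.extend(papers)
    if tag is None and isinstance(payload, dict):
        tag = payload.get("template_tag")

# The command always emits the bundle form.
Path("merged.json").write_text(
    json.dumps({"template_tag": tag or "unknown", "papers": merged}, ensure_ascii=False, indent=2),
    encoding="utf-8",
)
```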
```diff
@@ -843,6 +1364,614 @@ def register_db_commands(db_group: click.Group) -> None:
         rendered = render_papers(papers, out_dir, template, output_language)
         click.echo(f"Rendered {rendered} markdown files")
 
+    @db_group.command("extract")
+    @click.option("--json", "target_json", default=None, help="Target JSON database path")
+    @click.option("--input-json", "input_json", default=None, help="Reference JSON file path")
+    @click.option(
+        "--pdf-root", "pdf_roots", multiple=True, help="PDF root directories for reference (repeatable)"
+    )
+    @click.option(
+        "--md-root", "md_roots", multiple=True, help="Markdown root directories for reference (repeatable)"
+    )
+    @click.option(
+        "--md-translated-root", "md_translated_roots", multiple=True,
+        help="Translated Markdown root directories to extract from (repeatable)"
+    )
+    @click.option(
+        "--md-source-root", "md_source_roots", multiple=True,
+        help="Source Markdown root directories to extract from (repeatable)"
+    )
+    @click.option("--output-json", "output_json", default=None, help="Output JSON file path")
+    @click.option(
+        "--output-md-translated-root",
+        "output_md_translated_root",
+        default=None,
+        help="Output directory for matched translated Markdown",
+    )
+    @click.option(
+        "--output-md-root",
+        "output_md_root",
+        default=None,
+        help="Output directory for matched source Markdown",
+    )
+    @click.option(
+        "-b",
+        "--input-bibtex",
+        "input_bibtex",
+        default=None,
+        help="Reference BibTeX file path",
+    )
+    @click.option("--lang", "lang", default=None, help="Language code for translated Markdown (e.g., zh)")
+    @click.option("--output-csv", "output_csv", default=None, help="Path to export results as CSV")
+    def extract(
+        target_json: str | None,
+        input_json: str | None,
+        pdf_roots: tuple[str, ...],
+        md_roots: tuple[str, ...],
+        md_translated_roots: tuple[str, ...],
+        md_source_roots: tuple[str, ...],
+        output_json: str | None,
+        output_md_translated_root: str | None,
+        output_md_root: str | None,
+        input_bibtex: str | None,
+        lang: str | None,
+        output_csv: str | None,
+    ) -> None:
+        from deepresearch_flow.paper import db_ops
+        from deepresearch_flow.paper.utils import stable_hash
+
+        if input_json and input_bibtex:
+            raise click.ClickException("Use only one of --input-json or --input-bibtex")
+
+        if target_json is None and input_json is not None:
+            target_json = input_json
+
+        has_reference = bool(pdf_roots or md_roots or input_json or input_bibtex)
+        if not has_reference:
+            raise click.ClickException(
+                "Provide at least one reference input: --pdf-root, --md-root, --input-json, or --input-bibtex"
+            )
+        if not target_json and not md_translated_roots and not md_source_roots:
+            raise click.ClickException(
+                "Provide --json and/or --md-translated-root and/or --md-source-root"
+            )
+        if target_json and not output_json:
+            raise click.ClickException("--output-json is required when using --json")
+        if output_json and not target_json:
+            raise click.ClickException("--json is required when using --output-json")
+        if md_translated_roots and not output_md_translated_root:
+            raise click.ClickException(
+                "--output-md-translated-root is required when using --md-translated-root"
+            )
+        if output_md_translated_root and not md_translated_roots:
+            raise click.ClickException(
+                "--md-translated-root is required when using --output-md-translated-root"
+            )
+        if md_source_roots and not output_md_root:
+            raise click.ClickException("--output-md-root is required when using --md-source-root")
+        if output_md_root and not md_source_roots:
+            raise click.ClickException("--md-source-root is required when using --output-md-root")
+        if md_translated_roots and not lang:
+            raise click.ClickException("--lang is required when extracting translated Markdown")
+
+        pdf_root_paths = [Path(path) for path in pdf_roots]
+        md_root_paths = [Path(path) for path in md_roots]
+        translated_root_paths = [Path(path) for path in md_translated_roots]
+        source_root_paths = [Path(path) for path in md_source_roots]
+        reference_json_path = Path(input_json) if input_json else None
+        reference_bibtex_path = Path(input_bibtex) if input_bibtex else None
+
+        reference_papers: list[dict[str, Any]] = []
+        if reference_json_path:
+            if not reference_json_path.is_file():
+                raise click.ClickException(f"Reference JSON not found: {reference_json_path}")
+            reference_papers, _ = load_json_payload(reference_json_path)
+        if reference_bibtex_path:
+            if not reference_bibtex_path.is_file():
+                raise click.ClickException(f"Reference BibTeX not found: {reference_bibtex_path}")
+            if not db_ops.PYBTEX_AVAILABLE:
+                raise click.ClickException("pybtex is required for --input-bibtex support")
+            bib_data = db_ops.parse_file(str(reference_bibtex_path))
+            for key, entry in bib_data.entries.items():
+                title = entry.fields.get("title")
+                if not title:
+                    continue
+                year = entry.fields.get("year") or ""
+                year = str(year) if str(year).isdigit() else ""
+                authors = []
+                for person in entry.persons.get("author", []):
+                    authors.append(str(person))
+                reference_papers.append(
+                    {
+                        "paper_title": str(title),
+                        "paper_authors": authors,
+                        "publication_date": year,
+                        "source_path": f"bibtex:{key}",
+                    }
+                )
+
+        reference_index: dict[str, list[dict[str, Any]]] = {}
+        for paper in reference_papers:
+            if "source_path" not in paper and reference_json_path:
+                paper["source_path"] = str(reference_json_path)
+            db_ops._prepare_paper_matching_fields(paper)
+        if reference_papers:
+            reference_index = db_ops._build_paper_index(reference_papers)
+
+        all_results: list[Any] = []
+
+        if target_json:
+            target_json_path = Path(target_json)
+            if not target_json_path.is_file():
+                raise click.ClickException(f"Target JSON not found: {target_json_path}")
+            papers, payload = load_json_payload(target_json_path)
+
+            results: list[Any] = []
+            matched_indices: set[int]
+            if pdf_root_paths or md_root_paths:
+                results, match_pairs, _, _ = db_ops.compare_datasets_with_pairs(
+                    json_paths_a=[target_json_path],
+                    pdf_roots_b=pdf_root_paths,
+                    md_roots_b=md_root_paths,
+                    bibtex_path=None,
+                    lang=None,
+                    show_progress=True,
+                )
+                matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
+                all_results.extend(results)
+            else:
+                matched_indices = set(range(len(papers)))
+
+            matched_reference_ids: set[int] = set()
+            if reference_index:
+                def detail_score(paper: dict[str, Any]) -> tuple[int, int]:
+                    non_empty = 0
+                    total_len = 0
+                    for value in paper.values():
+                        if value is None:
+                            continue
+                        if isinstance(value, (list, dict)):
+                            if value:
+                                non_empty += 1
+                                total_len += len(
+                                    json.dumps(value, ensure_ascii=False, sort_keys=True)
+                                )
+                        else:
+                            text = str(value).strip()
+                            if text:
+                                non_empty += 1
+                                total_len += len(text)
+                    return non_empty, total_len
+
+                def resolve_reference_match(
+                    paper: dict[str, Any],
+                ) -> tuple[dict[str, Any] | None, str | None, float]:
+                    match_paper, match_type, match_score = db_ops._resolve_paper_by_title_and_meta(
+                        paper, reference_index
+                    )
+                    if match_paper is not None:
+                        return match_paper, match_type, match_score
+                    year = str(paper.get("_year") or "").strip()
+                    if not year.isdigit():
+                        return None, None, 0.0
+                    authors = paper.get("_authors") or []
+                    author_key = ""
+                    if authors:
+                        author_key = db_ops._normalize_author_key(str(authors[0]))
+                    candidates: list[dict[str, Any]] = []
+                    fallback_type = "year_relaxed"
+                    if author_key:
+                        candidates = reference_index.get(f"authoryear:{year}:{author_key}", [])
+                        if candidates:
+                            fallback_type = "author_year_relaxed"
+                    if not candidates:
+                        candidates = reference_index.get(f"year:{year}", [])
+                    if not candidates:
+                        return None, None, 0.0
+                    title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
+                    match, score = db_ops._adaptive_similarity_match_papers(title_key, candidates)
+                    if match is None:
+                        return candidates[0], fallback_type, 0.0
+                    return match, fallback_type, score
+
+                base_indices = set(matched_indices)
+                best_matches: dict[int, tuple[int, tuple[int, int], str | None, float]] = {}
+                for idx, paper in enumerate(papers):
+                    if idx not in matched_indices:
+                        continue
+                    db_ops._prepare_paper_matching_fields(paper)
+                    match_paper, match_type, match_score = resolve_reference_match(paper)
+                    if match_paper is None:
+                        continue
+                    ref_id = id(match_paper)
+                    score = detail_score(paper)
+                    current = best_matches.get(ref_id)
+                    if current is None:
+                        best_matches[ref_id] = (idx, score, match_type, match_score)
+                        continue
+                    if score > current[1] or (score == current[1] and match_score > current[3]):
+                        best_matches[ref_id] = (idx, score, match_type, match_score)
+
+                matched_reference_ids = set(best_matches.keys())
+                matched_indices = {idx for idx, *_ in best_matches.values()}
+
+            matched_papers = [paper for idx, paper in enumerate(papers) if idx in matched_indices]
+            deduped_papers: list[Any] = []
+            seen_titles: set[str] = set()
+            for paper in matched_papers:
+                title_key = db_ops._normalize_title_key(str(paper.get("paper_title") or ""))
+                if title_key:
+                    if title_key in seen_titles:
+                        continue
+                    seen_titles.add(title_key)
+                deduped_papers.append(paper)
+            if len(deduped_papers) != len(matched_papers):
+                removed = len(matched_papers) - len(deduped_papers)
+                click.echo(f"Deduplicated {removed} entries by normalized title.")
+            matched_papers = deduped_papers
+            output_path = Path(output_json) if output_json else None
+            if output_path is None:
+                raise click.ClickException("--output-json is required when using --json")
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            if payload is None:
+                write_json(output_path, matched_papers)
+            else:
+                output_payload = dict(payload)
+                output_payload["papers"] = matched_papers
+                write_json(output_path, output_payload)
+            click.echo(f"Extracted {len(matched_papers)} JSON entries to {output_path}")
+
+            if output_csv and reference_papers:
+                match_meta_by_ref_id = {
+                    ref_id: (idx, match_type, match_score)
+                    for ref_id, (idx, _, match_type, match_score) in best_matches.items()
+                }
+                for ref in reference_papers:
+                    ref_id = id(ref)
+                    ref_title = str(ref.get("paper_title") or "")
+                    ref_hash = stable_hash(str(ref_title or ref.get("source_path") or ""))
+                    ref_path = str(ref.get("source_path") or "")
+                    if ref_id in match_meta_by_ref_id:
+                        idx, match_type, match_score = match_meta_by_ref_id[ref_id]
+                        paper = papers[idx]
+                        paper_hash = str(paper.get("source_hash") or "") or stable_hash(
+                            str(paper.get("paper_title") or "")
+                        )
+                        all_results.append(
+                            db_ops.CompareResult(
+                                side="MATCH",
+                                source_hash=ref_hash,
+                                title=ref_title,
+                                match_status="matched_pair",
+                                match_type=match_type,
+                                match_score=match_score,
+                                source_path=ref_path,
+                                other_source_hash=paper_hash,
+                                other_title=str(paper.get("paper_title") or ""),
+                                other_source_path=str(paper.get("source_path") or ""),
+                                lang=None,
+                            )
+                        )
+                        continue
+                    all_results.append(
+                        db_ops.CompareResult(
+                            side="B",
+                            source_hash=ref_hash,
+                            title=ref_title,
+                            match_status="only_in_B",
+                            match_type=None,
+                            match_score=0.0,
+                            source_path=ref_path,
+                            other_source_hash=None,
+                            other_title=None,
+                            other_source_path=None,
+                            lang=None,
+                        )
+                    )
+
+                for idx in sorted(base_indices - matched_indices):
+                    paper = papers[idx]
+                    paper_title = str(paper.get("paper_title") or "")
+                    paper_hash = str(paper.get("source_hash") or "") or stable_hash(paper_title)
+                    all_results.append(
+                        db_ops.CompareResult(
+                            side="A",
+                            source_hash=paper_hash,
+                            title=paper_title,
+                            match_status="only_in_A",
+                            match_type=None,
+                            match_score=0.0,
+                            source_path=str(paper.get("source_path") or ""),
+                            other_source_hash=None,
+                            other_title=None,
+                            other_source_path=None,
+                            lang=None,
+                        )
+                    )
+
+        copied_count = 0
+        if md_translated_roots:
+            output_root = Path(output_md_translated_root) if output_md_translated_root else None
+            if output_root is None:
+                raise click.ClickException(
+                    "--output-md-translated-root is required when using --md-translated-root"
+                )
+            results, match_pairs, dataset_a, _ = compare_datasets_with_pairs(
+                md_translated_roots_a=translated_root_paths,
+                pdf_roots_b=pdf_root_paths,
+                md_roots_b=md_root_paths,
+                lang=lang,
+                show_progress=True,
+            )
+            matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
+            copy_iter = tqdm(
+                enumerate(dataset_a.papers),
+                total=len(dataset_a.papers),
+                desc="copy translated",
+                unit="file",
+            )
+            for idx, paper in copy_iter:
+                if idx not in matched_indices:
+                    continue
+                source_path = paper.get("source_path")
+                if not source_path:
+                    continue
+                source = Path(str(source_path))
+                relative = resolve_relative_path(source, translated_root_paths)
+                destination = output_root / relative
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy2(source, destination)
+                copied_count += 1
+            click.echo(
+                f"Copied {copied_count} translated Markdown files to {output_root}"
+            )
+            all_results.extend(results)
+
+        if md_source_roots:
+            output_root = Path(output_md_root) if output_md_root else None
+            if output_root is None:
+                raise click.ClickException("--output-md-root is required when using --md-source-root")
+            results, match_pairs, dataset_a, _ = compare_datasets_with_pairs(
+                md_roots_a=source_root_paths,
+                pdf_roots_b=pdf_root_paths,
+                md_roots_b=md_root_paths,
+                lang=None,
+                show_progress=True,
+            )
+            matched_indices = {idx_a for idx_a, _, _, _ in match_pairs}
+            copied_source = 0
+            copy_iter = tqdm(
+                enumerate(dataset_a.papers),
+                total=len(dataset_a.papers),
+                desc="copy source",
+                unit="file",
+            )
+            for idx, paper in copy_iter:
+                if idx not in matched_indices:
+                    continue
+                source_path = paper.get("source_path")
+                if not source_path:
+                    continue
+                source = Path(str(source_path))
+                relative = resolve_relative_path(source, source_root_paths)
+                destination = output_root / relative
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy2(source, destination)
+                copied_source += 1
+            click.echo(f"Copied {copied_source} source Markdown files to {output_root}")
+            copied_count += copied_source
+            all_results.extend(results)
+
+        if output_csv:
+            output_path = Path(output_csv)
+            export_compare_csv(all_results, output_path)
+            click.echo(f"Results exported to: {output_path}")
+
+    @db_group.command("verify")
+    @click.option("--input-json", "input_json", required=True, help="Input JSON file path")
+    @click.option(
+        "--output-json",
+        "output_json",
+        required=True,
+        help="Output verification report JSON path",
+    )
+    @click.option(
+        "--prompt-template",
+        "prompt_template",
+        default=None,
+        type=click.Choice(list_template_names()),
+        help="Prompt template to load schema (e.g., deep_read)",
+    )
+    @click.option(
+        "-s",
+        "--schema-json",
+        "--schema",
+        "schema_json",
+        default=None,
+        help="Custom schema JSON path",
+    )
+    @click.option(
+        "--ignore-field",
+        "ignore_fields",
+        multiple=True,
+        help="Schema field to ignore when checking empties (repeatable)",
+    )
+    def verify(
+        input_json: str,
+        output_json: str,
+        prompt_template: str | None,
+        schema_json: str | None,
+        ignore_fields: tuple[str, ...],
+    ) -> None:
+        if prompt_template and schema_json:
+            raise click.ClickException("Use only one of --prompt-template or --schema-json")
+        if not prompt_template and not schema_json:
+            raise click.ClickException("Provide --prompt-template or --schema-json")
+
+        input_path = Path(input_json)
+        if not input_path.is_file():
+            raise click.ClickException(f"Input JSON not found: {input_path}")
+
+        papers, payload = load_json_payload(input_path)
+        template_tag = (
+            prompt_template
+            or (payload.get("template_tag") if isinstance(payload, dict) else None)
+            or "custom"
+        )
+
+        try:
+            if schema_json:
+                schema = load_schema(schema_json)
+            else:
+                schema = load_schema_for_template(prompt_template or template_tag)
+        except SchemaError as exc:
+            raise click.ClickException(str(exc)) from exc
+        except ValueError as exc:
+            raise click.ClickException(str(exc)) from exc
+
+        ignore_set = {field.strip() for field in ignore_fields if field.strip()}
+        properties = schema.get("properties", {})
+        schema_fields = sorted(
+            field
+            for field in (set(properties.keys()) | set(schema.get("required", [])))
+            if field not in ignore_set
+        )
+        if not schema_fields:
+            raise click.ClickException("Schema does not define any properties")
+
+        stage_defs = get_stage_definitions(prompt_template or template_tag)
+        field_stage_map: dict[str, str] = {}
+        for stage_def in stage_defs:
+            for field in stage_def.fields:
+                if field in ignore_set:
+                    continue
+                field_stage_map.setdefault(field, stage_def.name)
+
+        report_items: list[dict[str, Any]] = []
+        for paper in papers:
+            if not isinstance(paper, dict):
+                continue
+            missing_fields = [
+                field
+                for field in schema_fields
+                if field not in paper or is_empty_value(paper.get(field))
+            ]
+            if not missing_fields:
+                continue
+            item: dict[str, Any] = {
+                "source_path": str(paper.get("source_path") or ""),
+                "paper_title": str(paper.get("paper_title") or ""),
+                "missing_fields": missing_fields,
+            }
+            if field_stage_map and all(field in field_stage_map for field in missing_fields):
+                item["retry_stages"] = sorted(
+                    {field_stage_map[field] for field in missing_fields}
+                )
+            report_items.append(item)
+
+        report_payload = {
+            "template_tag": template_tag,
+            "schema_fields": schema_fields,
+            "items": report_items,
+        }
+
+        output_path = Path(output_json)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        write_json(output_path, report_payload)
+
+        console = Console()
+        total_missing = sum(len(item["missing_fields"]) for item in report_items)
+        summary_table = Table(title="db verify summary")
+        summary_table.add_column("Metric", style="cyan")
+        summary_table.add_column("Value", style="white", overflow="fold")
+        summary_table.add_row("Input", str(input_path))
+        summary_table.add_row("Template", template_tag)
+        summary_table.add_row("Items", str(len(papers)))
+        summary_table.add_row("Items with missing fields", str(len(report_items)))
+        summary_table.add_row("Total missing fields", str(total_missing))
+        if ignore_set:
+            summary_table.add_row("Ignored fields", ", ".join(sorted(ignore_set)))
+        summary_table.add_row("Output", str(output_path))
+        console.print(summary_table)
+
+        if report_items:
+            field_counts: dict[str, int] = {field: 0 for field in schema_fields}
+            for item in report_items:
+                for field in item["missing_fields"]:
+                    field_counts[field] = field_counts.get(field, 0) + 1
+
+            count_table = Table(title="Missing field counts")
+            count_table.add_column("Field", style="cyan")
+            count_table.add_column("Missing", style="yellow", justify="right")
+            for field, count in sorted(field_counts.items(), key=lambda x: (-x[1], x[0])):
+                if count:
+                    count_table.add_row(field, str(count))
+            console.print(count_table)
+
+            detail_table = Table(title="Missing field details")
+            detail_table.add_column("#", style="dim", justify="right")
+            detail_table.add_column("Title", style="white", overflow="fold")
+            detail_table.add_column("Source Path", style="cyan", overflow="fold")
+            detail_table.add_column("Missing Fields", style="yellow", overflow="fold")
+            detail_table.add_column("Retry Stages", style="green", overflow="fold")
+            for idx, item in enumerate(report_items, start=1):
+                retry_stages = item.get("retry_stages") or []
+                detail_table.add_row(
+                    str(idx),
+                    item.get("paper_title") or "",
+                    item.get("source_path") or "",
+                    ", ".join(item.get("missing_fields", [])),
+                    ", ".join(retry_stages),
+                )
+            console.print(detail_table)
+        else:
+            console.print(Panel("[green]No missing fields detected.[/green]", expand=False))
+
+    @db_group.command("transfer-pdfs")
+    @click.option("--input-list", "input_list", required=True, help="Text file containing PDF paths")
+    @click.option("--output-dir", "output_dir", required=True, help="Output directory")
+    @click.option("--move", "move_files", is_flag=True, help="Move PDFs instead of copying")
+    @click.option("--copy", "copy_files", is_flag=True, help="Copy PDFs instead of moving")
+    def transfer_pdfs(
+        input_list: str,
+        output_dir: str,
+        move_files: bool,
+        copy_files: bool,
+    ) -> None:
+        if move_files == copy_files:
+            raise click.ClickException("Specify exactly one of --move or --copy")
+
+        list_path = Path(input_list)
+        if not list_path.is_file():
+            raise click.ClickException(f"Input list not found: {list_path}")
+
+        destination_root = Path(output_dir)
+        destination_root.mkdir(parents=True, exist_ok=True)
+
+        entries = [line.strip() for line in list_path.read_text(encoding="utf-8").splitlines()]
+        entries = [line for line in entries if line]
+
+        processed = 0
+        missing = 0
+        transfer_iter = tqdm(entries, total=len(entries), desc="transfer pdfs", unit="file")
+        for raw in transfer_iter:
+            source = Path(raw).expanduser()
+            if not source.is_file():
+                missing += 1
+                continue
+            destination = destination_root / source.name
+            destination.parent.mkdir(parents=True, exist_ok=True)
+            if move_files:
+                shutil.move(str(source), str(destination))
+            else:
+                shutil.copy2(source, destination)
+            processed += 1
+
+        action = "Moved" if move_files else "Copied"
+        click.echo(f"{action} {processed} PDFs to {destination_root}")
+        if missing:
+            click.echo(f"Skipped {missing} missing paths")
+
     @db_group.command("compare")
     @click.option(
         "-ia", "--input-a", "input_paths_a", multiple=True, help="Input JSON files for side A (repeatable)"
```
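`db verify` writes a report of the shape `{template_tag, schema_fields, items}`, where each item lists a paper's empty schema fields and, when every missing field maps to a stage, the stages to retry. A sketch of consuming that report (the report file name is hypothetical):

```python
import json
from pathlib import Path

report = json.loads(Path("verify_report.json").read_text(encoding="utf-8"))
print("template:", report["template_tag"])
for item in report["items"]:
    stages = ", ".join(item.get("retry_stages", [])) or "n/a"
    print(f'{item["paper_title"]}: {len(item["missing_fields"])} empty fields (retry: {stages})')
```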
```diff
@@ -875,6 +2004,12 @@ def register_db_commands(db_group: click.Group) -> None:
     @click.option(
         "--output-csv", "output_csv", default=None, help="Path to export results as CSV"
     )
+    @click.option(
+        "--output-only-in-b",
+        "output_only_in_b",
+        default=None,
+        help="Path to export only-in-B source paths as a newline list",
+    )
     @click.option(
         "--sample-limit", "sample_limit", default=5, type=int, show_default=True,
         help="Number of sample items to show in terminal output"
```
```diff
@@ -891,12 +2026,12 @@ def register_db_commands(db_group: click.Group) -> None:
         bibtex_path: str | None,
         lang: str | None,
         output_csv: str | None,
+        output_only_in_b: str | None,
         sample_limit: int,
     ) -> None:
         """Compare two datasets and report matches and differences."""
         from deepresearch_flow.paper.db_ops import compare_datasets
-
-
+
         # Validate that at least one input is provided for each side
         has_input_a = bool(input_paths_a or pdf_roots_a or md_roots_a or md_translated_roots_a)
         has_input_b = bool(input_paths_b or pdf_roots_b or md_roots_b or md_translated_roots_b)
```
```diff
@@ -925,6 +2060,7 @@ def register_db_commands(db_group: click.Group) -> None:
                 md_translated_roots_b=[Path(p) for p in md_translated_roots_b],
                 bibtex_path=Path(bibtex_path) if bibtex_path else None,
                 lang=lang,
+                show_progress=True,
             )
         except ValueError as exc:
             raise click.ClickException(str(exc)) from exc
```
```diff
@@ -998,31 +2134,14 @@ def register_db_commands(db_group: click.Group) -> None:
         # Export to CSV if requested
         if output_csv:
             output_path = Path(output_csv)
-
-
-            with open(output_path, "w", newline="", encoding="utf-8") as f:
-                writer = csv.writer(f)
-                writer.writerow([
-                    "Side", "Source Hash", "Title", "Match Status", "Match Type",
-                    "Match Score", "Source Path", "Other Source Hash", "Other Title",
-                    "Other Source Path", "Lang"
-                ])
-                for r in results:
-                    writer.writerow([
-                        r.side,
-                        r.source_hash,
-                        r.title,
-                        r.match_status,
-                        r.match_type or "",
-                        f"{r.match_score:.4f}",
-                        r.source_path or "",
-                        r.other_source_hash or "",
-                        r.other_title or "",
-                        r.other_source_path or "",
-                        r.lang or "",
-                    ])
-
+            export_compare_csv(results, output_path)
             console.print(f"\n[green]Results exported to: {output_path}[/green]")
-
+        if output_only_in_b:
+            output_path = Path(output_only_in_b)
+            count = export_only_in_b_paths(results, output_path)
+            console.print(
+                f"\n[green]Only-in-B list exported ({count} items): {output_path}[/green]"
+            )
+
         # Print final counts
         console.print(f"\nTotal results: {len(results)}")
```