deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/db.py +184 -0
- deepresearch_flow/paper/db_ops.py +1939 -0
- deepresearch_flow/paper/web/app.py +38 -3705
- deepresearch_flow/paper/web/constants.py +23 -0
- deepresearch_flow/paper/web/filters.py +255 -0
- deepresearch_flow/paper/web/handlers/__init__.py +14 -0
- deepresearch_flow/paper/web/handlers/api.py +217 -0
- deepresearch_flow/paper/web/handlers/pages.py +334 -0
- deepresearch_flow/paper/web/markdown.py +549 -0
- deepresearch_flow/paper/web/static/css/main.css +857 -0
- deepresearch_flow/paper/web/static/js/detail.js +406 -0
- deepresearch_flow/paper/web/static/js/index.js +266 -0
- deepresearch_flow/paper/web/static/js/outline.js +58 -0
- deepresearch_flow/paper/web/static/js/stats.js +39 -0
- deepresearch_flow/paper/web/templates/base.html +43 -0
- deepresearch_flow/paper/web/templates/detail.html +332 -0
- deepresearch_flow/paper/web/templates/index.html +114 -0
- deepresearch_flow/paper/web/templates/stats.html +29 -0
- deepresearch_flow/paper/web/templates.py +85 -0
- deepresearch_flow/paper/web/text.py +68 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/METADATA +23 -2
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +26 -8
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/db_ops.py
@@ -0,0 +1,1939 @@
from __future__ import annotations

from dataclasses import dataclass
import difflib
import json
from pathlib import Path
from typing import Any
import re
import unicodedata

from deepresearch_flow.paper.utils import stable_hash

try:
    from pypdf import PdfReader
    PYPDF_AVAILABLE = True
except Exception:
    PYPDF_AVAILABLE = False

try:
    from pybtex.database import parse_file
    PYBTEX_AVAILABLE = True
except Exception:
    PYBTEX_AVAILABLE = False


@dataclass(frozen=True)
class PaperIndex:
    papers: list[dict[str, Any]]
    id_by_hash: dict[str, int]
    ordered_ids: list[int]
    by_tag: dict[str, set[int]]
    by_author: dict[str, set[int]]
    by_year: dict[str, set[int]]
    by_month: dict[str, set[int]]
    by_venue: dict[str, set[int]]
    stats: dict[str, Any]
    md_path_by_hash: dict[str, Path]
    translated_md_by_hash: dict[str, dict[str, Path]]
    pdf_path_by_hash: dict[str, Path]
    template_tags: list[str]


def _normalize_key(value: str) -> str:
    return value.strip().lower()


def _parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
    if not date_str:
        return None, None
    text = str(date_str).strip()
    year = None
    month = None

    year_match = re.search(r"(19|20)\d{2}", text)
    if year_match:
        year = year_match.group(0)

    numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", text)
    if numeric_match:
        m = int(numeric_match.group(2))
        if 1 <= m <= 12:
            month = f"{m:02d}"
        return year, month

    month_word = re.search(
        r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
        r"january|february|march|april|june|july|august|september|october|november|december)",
        text.lower(),
    )
    if month_word:
        lookup = {
            "january": "01",
            "february": "02",
            "march": "03",
            "april": "04",
            "may": "05",
            "june": "06",
            "july": "07",
            "august": "08",
            "september": "09",
            "october": "10",
            "november": "11",
            "december": "12",
            "jan": "01",
            "feb": "02",
            "mar": "03",
            "apr": "04",
            "jun": "06",
            "jul": "07",
            "aug": "08",
            "sep": "09",
            "sept": "09",
            "oct": "10",
            "nov": "11",
            "dec": "12",
        }
        month = lookup.get(month_word.group(0))

    return year, month


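# Minimal sanity sketch for _parse_year_month; the date strings below are
# invented examples, not values from the package.
def _example_parse_year_month() -> None:
    assert _parse_year_month("2021-03-15") == ("2021", "03")   # numeric month
    assert _parse_year_month("March 2021") == ("2021", "03")   # month name
    assert _parse_year_month("forthcoming") == (None, None)    # no date found

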
def _normalize_month_token(value: str | None) -> str | None:
    if not value:
        return None
    raw = str(value).strip().lower()
    if not raw:
        return None
    if raw.isdigit():
        num = int(raw)
        if 1 <= num <= 12:
            return f"{num:02d}"
    lookup = {
        "january": "01",
        "february": "02",
        "march": "03",
        "april": "04",
        "may": "05",
        "june": "06",
        "july": "07",
        "august": "08",
        "september": "09",
        "october": "10",
        "november": "11",
        "december": "12",
        "jan": "01",
        "feb": "02",
        "mar": "03",
        "apr": "04",
        "jun": "06",
        "jul": "07",
        "aug": "08",
        "sep": "09",
        "sept": "09",
        "oct": "10",
        "nov": "11",
        "dec": "12",
    }
    return lookup.get(raw)


def _extract_authors(paper: dict[str, Any]) -> list[str]:
    value = paper.get("paper_authors")
    if value is None:
        return []
    if isinstance(value, list):
        return [str(item).strip() for item in value if str(item).strip()]
    if isinstance(value, str):
        return [part.strip() for part in value.split(",") if part.strip()]
    return [str(value)]


def _extract_tags(paper: dict[str, Any]) -> list[str]:
    tags = paper.get("ai_generated_tags") or []
    if isinstance(tags, list):
        return [str(tag).strip() for tag in tags if str(tag).strip()]
    return []


def _extract_keywords(paper: dict[str, Any]) -> list[str]:
    keywords = paper.get("keywords") or []
    if isinstance(keywords, list):
        return [str(keyword).strip() for keyword in keywords if str(keyword).strip()]
    if isinstance(keywords, str):
        parts = re.split(r"[;,]", keywords)
        return [part.strip() for part in parts if part.strip()]
    return []


_SUMMARY_FIELDS = (
    "summary",
    "abstract",
    "keywords",
    "question1",
    "question2",
    "question3",
    "question4",
    "question5",
    "question6",
    "question7",
    "question8",
)


def _has_summary(paper: dict[str, Any], template_tags: list[str]) -> bool:
    if template_tags:
        return True
    for key in _SUMMARY_FIELDS:
        value = paper.get(key)
        if isinstance(value, str) and value.strip():
            return True
    return False


def _extract_venue(paper: dict[str, Any]) -> str:
    if isinstance(paper.get("bibtex"), dict):
        bib = paper.get("bibtex") or {}
        fields = bib.get("fields") or {}
        bib_type = (bib.get("type") or "").lower()
        if bib_type == "article" and fields.get("journal"):
            return str(fields.get("journal"))
        if bib_type in {"inproceedings", "conference", "proceedings"} and fields.get("booktitle"):
            return str(fields.get("booktitle"))
    return str(paper.get("publication_venue") or "")


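# Quick sketch of _extract_venue's precedence; the sample records are invented:
def _example_extract_venue() -> None:
    assert _extract_venue({"bibtex": {"type": "article", "fields": {"journal": "Nature"}}}) == "Nature"
    assert _extract_venue({"publication_venue": "NeurIPS 2020"}) == "NeurIPS 2020"
    assert _extract_venue({}) == ""

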
def _available_templates(paper: dict[str, Any]) -> list[str]:
    templates = paper.get("templates")
    if not isinstance(templates, dict):
        return []
    order = paper.get("template_order") or list(templates.keys())
    seen: set[str] = set()
    available: list[str] = []
    for tag in order:
        if tag in templates and tag not in seen:
            available.append(tag)
            seen.add(tag)
    for tag in templates:
        if tag not in seen:
            available.append(tag)
            seen.add(tag)
    return available


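# _available_templates keeps template_order first, then appends any remaining
# templates; a small invented example:
def _example_available_templates() -> None:
    paper = {"templates": {"simple": {}, "detailed": {}}, "template_order": ["detailed"]}
    assert _available_templates(paper) == ["detailed", "simple"]

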
_TITLE_PREFIX_LEN = 16
_TITLE_MIN_CHARS = 24
_TITLE_MIN_TOKENS = 4
_AUTHOR_YEAR_MIN_SIMILARITY = 0.8
_LEADING_NUMERIC_MAX_LEN = 2
_SIMILARITY_START = 0.95
_SIMILARITY_STEP = 0.05
_SIMILARITY_MAX_STEPS = 10


def _normalize_title_key(title: str) -> str:
    value = unicodedata.normalize("NFKD", title)
    greek_map = {
        "α": "alpha",
        "β": "beta",
        "γ": "gamma",
        "δ": "delta",
        "ε": "epsilon",
        "ζ": "zeta",
        "η": "eta",
        "θ": "theta",
        "ι": "iota",
        "κ": "kappa",
        "λ": "lambda",
        "μ": "mu",
        "ν": "nu",
        "ξ": "xi",
        "ο": "omicron",
        "π": "pi",
        "ρ": "rho",
        "σ": "sigma",
        "τ": "tau",
        "υ": "upsilon",
        "φ": "phi",
        "χ": "chi",
        "ψ": "psi",
        "ω": "omega",
    }
    for char, name in greek_map.items():
        value = value.replace(char, f" {name} ")
    value = re.sub(
        r"\\(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\b",
        r" \1 ",
        value,
        flags=re.IGNORECASE,
    )
    value = value.replace("{", "").replace("}", "")
    value = value.replace("_", " ")
    value = re.sub(r"([a-z])([0-9])", r"\1 \2", value, flags=re.IGNORECASE)
    value = re.sub(r"([0-9])([a-z])", r"\1 \2", value, flags=re.IGNORECASE)
    value = re.sub(r"[^a-z0-9]+", " ", value.lower())
    value = re.sub(r"\s+", " ", value).strip()
    tokens = value.split()
    if not tokens:
        return ""
    merged: list[str] = []
    idx = 0
    while idx < len(tokens):
        token = tokens[idx]
        if len(token) == 1 and idx + 1 < len(tokens):
            merged.append(token + tokens[idx + 1])
            idx += 2
            continue
        merged.append(token)
        idx += 1
    return " ".join(merged)


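# Sketch of _normalize_title_key on two invented titles: Greek letters are
# spelled out, punctuation is dropped, and stray single-character tokens are
# re-joined to the following token (so a split form like "T 5" becomes "t5").
def _example_normalize_title_key() -> None:
    assert _normalize_title_key("T5: Text-to-Text Transfer") == "t5 text to text transfer"
    assert _normalize_title_key("β-VAE: Learning Basic Visual Concepts") == (
        "beta vae learning basic visual concepts"
    )

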
def _compact_title_key(title_key: str) -> str:
    return title_key.replace(" ", "")


def _strip_leading_numeric_tokens(title_key: str) -> str:
    tokens = title_key.split()
    idx = 0
    while idx < len(tokens):
        token = tokens[idx]
        if token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN:
            idx += 1
            continue
        break
    if idx == 0:
        return title_key
    return " ".join(tokens[idx:])


def _strip_pdf_hash_suffix(name: str) -> str:
    return re.sub(r"(?i)(\.pdf)(?:-[0-9a-f\-]{8,})$", r"\1", name)


def _extract_title_from_filename(name: str) -> str:
    base = name
    lower = base.lower()
    if lower.endswith(".md"):
        base = base[:-3]
        lower = base.lower()
    if ".pdf-" in lower:
        base = _strip_pdf_hash_suffix(base)
        lower = base.lower()
    if lower.endswith(".pdf"):
        base = base[:-4]
    base = base.replace("_", " ").strip()
    match = re.match(r"\s*\d{4}\s*-\s*(.+)$", base)
    if match:
        return match.group(1).strip()
    match = re.match(r"\s*.+?\s*-\s*\d{4}\s*-\s*(.+)$", base)
    if match:
        return match.group(1).strip()
    return base.strip()


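# _extract_title_from_filename peels off ".md", converter hash suffixes, and
# "Author - Year -" prefixes; two invented filenames:
def _example_extract_title_from_filename() -> None:
    assert _extract_title_from_filename(
        "Vaswani - 2017 - Attention Is All You Need.pdf"
    ) == "Attention Is All You Need"
    assert _extract_title_from_filename(
        "2017 - Attention Is All You Need.pdf-1a2b3c4d.md"
    ) == "Attention Is All You Need"

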
def _clean_pdf_metadata_title(value: str | None, path: Path) -> str | None:
    if not value:
        return None
    text = str(value).replace("\x00", "").strip()
    if not text:
        return None
    text = re.sub(r"(?i)^microsoft\s+word\s*-\s*", "", text)
    text = re.sub(r"(?i)^pdf\s*-\s*", "", text)
    text = re.sub(r"(?i)^untitled\b", "", text).strip()
    if text.lower().endswith(".pdf"):
        text = text[:-4].strip()
    if len(text) < 3:
        return None
    stem = path.stem.strip()
    if stem and text.lower() == stem.lower():
        return None
    return text


def _read_pdf_metadata_title(path: Path) -> str | None:
    if not PYPDF_AVAILABLE:
        return None
    try:
        reader = PdfReader(str(path))
        meta = reader.metadata
        title = meta.title if meta else None
    except Exception:
        return None
    return _clean_pdf_metadata_title(title, path)


def _is_pdf_like(path: Path) -> bool:
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        return True
    name_lower = path.name.lower()
    return ".pdf-" in name_lower and not name_lower.endswith(".md")


def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
    pdf_paths: list[Path] = []
    meta: list[dict[str, Any]] = []
    seen: set[Path] = set()
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        files: list[Path] = []
        for path in root.rglob("*"):
            try:
                if not path.is_file():
                    continue
            except OSError:
                continue
            if not _is_pdf_like(path):
                continue
            resolved = path.resolve()
            if resolved in seen:
                continue
            seen.add(resolved)
            files.append(resolved)
        max_mtime = 0.0
        total_size = 0
        for path in files:
            try:
                stats = path.stat()
            except OSError:
                continue
            max_mtime = max(max_mtime, stats.st_mtime)
            total_size += stats.st_size
        pdf_paths.extend(files)
        meta.append(
            {
                "path": str(root),
                "count": len(files),
                "max_mtime": max_mtime,
                "size": total_size,
            }
        )
    return pdf_paths, meta


def _extract_year_author_from_filename(name: str) -> tuple[str | None, str | None]:
    base = name
    lower = base.lower()
    if lower.endswith(".md"):
        base = base[:-3]
        lower = base.lower()
    if ".pdf-" in lower:
        base = _strip_pdf_hash_suffix(base)
        lower = base.lower()
    if lower.endswith(".pdf"):
        base = base[:-4]
    match = re.match(r"\s*(.+?)\s*-\s*((?:19|20)\d{2})\s*-\s*", base)
    if match:
        return match.group(2), match.group(1).strip()
    match = re.match(r"\s*((?:19|20)\d{2})\s*-\s*", base)
    if match:
        return match.group(1), None
    return None, None


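# Filename metadata hints, using an invented "Author - Year - Title" name:
def _example_extract_year_author() -> None:
    assert _extract_year_author_from_filename(
        "Vaswani - 2017 - Attention Is All You Need.pdf"
    ) == ("2017", "Vaswani")
    assert _extract_year_author_from_filename("2017 - Attention.pdf") == ("2017", None)
    assert _extract_year_author_from_filename("notes.md") == (None, None)

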
def _normalize_author_key(name: str) -> str:
    raw = name.lower().strip()
    raw = raw.replace("et al.", "").replace("et al", "")
    if "," in raw:
        raw = raw.split(",", 1)[0]
    raw = re.sub(r"[^a-z0-9]+", " ", raw)
    raw = re.sub(r"\s+", " ", raw).strip()
    if not raw:
        return ""
    parts = raw.split()
    return parts[-1] if parts else raw


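# Both "Surname, Given" and "Given Surname et al." collapse to the final
# surname token; names invented:
def _example_normalize_author_key() -> None:
    assert _normalize_author_key("Vaswani, Ashish") == "vaswani"
    assert _normalize_author_key("Ashish Vaswani et al.") == "vaswani"

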
def _title_prefix_key(title_key: str) -> str | None:
    if len(title_key.split()) < _TITLE_MIN_TOKENS:
        return None
    compact = _compact_title_key(title_key)
    if len(compact) < _TITLE_PREFIX_LEN:
        return None
    prefix = compact[:_TITLE_PREFIX_LEN]
    if not prefix:
        return None
    return f"prefix:{prefix}"


def _title_overlap_match(a: str, b: str) -> bool:
    if not a or not b:
        return False
    if a == b:
        return True
    shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
    token_count = len(shorter.split())
    if len(shorter) >= _TITLE_MIN_CHARS or token_count >= _TITLE_MIN_TOKENS:
        if longer.startswith(shorter) or shorter in longer:
            return True
    return False


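# Containment only counts when the shorter key is substantial (>= 24 chars or
# >= 4 tokens), so tiny keys cannot swallow longer ones; keys invented:
def _example_title_overlap_match() -> None:
    assert _title_overlap_match(
        "attention is all you need",
        "attention is all you need for translation",
    )
    assert not _title_overlap_match("bert", "bert pretraining")

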
def _title_similarity(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()


def _adaptive_similarity_match(title_key: str, candidates: list[Path]) -> tuple[Path | None, float]:
    if not title_key:
        return None, 0.0
    scored: list[tuple[Path, float]] = []
    for path in candidates:
        candidate_title = _normalize_title_key(_extract_title_from_filename(path.name))
        if not candidate_title:
            continue
        if _title_overlap_match(title_key, candidate_title):
            return path, 1.0
        scored.append((path, _title_similarity(title_key, candidate_title)))
    if not scored:
        return None, 0.0

    def matches_at(threshold: float) -> list[tuple[Path, float]]:
        return [(path, score) for path, score in scored if score >= threshold]

    threshold = _SIMILARITY_START
    step = _SIMILARITY_STEP
    prev_threshold = None
    prev_count = None
    for _ in range(_SIMILARITY_MAX_STEPS):
        matches = matches_at(threshold)
        if len(matches) == 1:
            path, score = matches[0]
            return path, score
        if len(matches) == 0:
            prev_threshold = threshold
            prev_count = 0
            threshold -= step
            continue
        if prev_count == 0 and prev_threshold is not None:
            low = threshold
            high = prev_threshold
            for _ in range(_SIMILARITY_MAX_STEPS):
                mid = (low + high) / 2
                mid_matches = matches_at(mid)
                if len(mid_matches) == 1:
                    path, score = mid_matches[0]
                    return path, score
                if len(mid_matches) == 0:
                    high = mid
                else:
                    low = mid
            return None, 0.0
        prev_threshold = threshold
        prev_count = len(matches)
        threshold -= step
    return None, 0.0


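# _adaptive_similarity_match lowers the threshold from 0.95 in 0.05 steps and
# bisects between "no match" and "too many matches" until exactly one
# candidate survives; an exact-overlap candidate short-circuits with score
# 1.0. A tiny invented example:
def _example_adaptive_similarity_match() -> None:
    cands = [Path("Attention Is All You Need.pdf"), Path("Deep Residual Learning.pdf")]
    path, score = _adaptive_similarity_match("attention is all you need", cands)
    assert path == cands[0] and score == 1.0

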
def _resolve_by_title_and_meta(
    paper: dict[str, Any],
    file_index: dict[str, list[Path]],
) -> tuple[Path | None, str | None, float]:
    title = str(paper.get("paper_title") or "")
    title_key = _normalize_title_key(title)
    if not title_key:
        title_key = ""
    candidates = file_index.get(title_key, [])
    if candidates:
        return candidates[0], "title", 1.0
    if title_key:
        compact_key = _compact_title_key(title_key)
        compact_candidates = file_index.get(f"compact:{compact_key}", [])
        if compact_candidates:
            return compact_candidates[0], "title_compact", 1.0
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            stripped_candidates = file_index.get(stripped_key, [])
            if stripped_candidates:
                return stripped_candidates[0], "title_stripped", 1.0
            stripped_compact = _compact_title_key(stripped_key)
            stripped_candidates = file_index.get(f"compact:{stripped_compact}", [])
            if stripped_candidates:
                return stripped_candidates[0], "title_compact", 1.0
    prefix_candidates: list[Path] = []
    prefix_key = _title_prefix_key(title_key)
    if prefix_key:
        prefix_candidates = file_index.get(prefix_key, [])
    if not prefix_candidates:
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            prefix_key = _title_prefix_key(stripped_key)
            if prefix_key:
                prefix_candidates = file_index.get(prefix_key, [])
    if prefix_candidates:
        match, score = _adaptive_similarity_match(title_key, prefix_candidates)
        if match is not None:
            match_type = "title_prefix" if score >= 1.0 else "title_fuzzy"
            return match, match_type, score
    year = str(paper.get("_year") or "").strip()
    if not year.isdigit():
        return None, None, 0.0
    author_key = ""
    authors = paper.get("_authors") or []
    if authors:
        author_key = _normalize_author_key(str(authors[0]))
    candidates = []
    match_type = "year"
    if author_key:
        candidates = file_index.get(f"authoryear:{year}:{author_key}", [])
        if candidates:
            match_type = "author_year"
    if not candidates:
        candidates = file_index.get(f"year:{year}", [])
    if not candidates:
        return None, None, 0.0
    if len(candidates) == 1 and not title_key:
        return candidates[0], match_type, 1.0
    match, score = _adaptive_similarity_match(title_key, candidates)
    if match is not None:
        if score < _AUTHOR_YEAR_MIN_SIMILARITY:
            return None, None, 0.0
        return match, "title_fuzzy", score
    return None, None, 0.0


def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    index: dict[str, list[Path]] = {}
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        for path in root.rglob("*"):
            try:
                if not path.is_file():
                    continue
            except OSError:
                continue
            suffix = path.suffix.lower()
            if suffix not in suffixes:
                name_lower = path.name.lower()
                if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
                    pass
                else:
                    continue
            resolved = path.resolve()
            name_key = path.name.lower()
            index.setdefault(name_key, []).append(resolved)
            title_candidate = _extract_title_from_filename(path.name)
            title_key = _normalize_title_key(title_candidate)
            if title_key:
                if title_key != name_key:
                    index.setdefault(title_key, []).append(resolved)
                compact_key = _compact_title_key(title_key)
                if compact_key:
                    index.setdefault(f"compact:{compact_key}", []).append(resolved)
                prefix_key = _title_prefix_key(title_key)
                if prefix_key:
                    index.setdefault(prefix_key, []).append(resolved)
                stripped_key = _strip_leading_numeric_tokens(title_key)
                if stripped_key and stripped_key != title_key:
                    index.setdefault(stripped_key, []).append(resolved)
                    stripped_compact = _compact_title_key(stripped_key)
                    if stripped_compact:
                        index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
                    stripped_prefix = _title_prefix_key(stripped_key)
                    if stripped_prefix:
                        index.setdefault(stripped_prefix, []).append(resolved)
            year_hint, author_hint = _extract_year_author_from_filename(path.name)
            if year_hint:
                index.setdefault(f"year:{year_hint}", []).append(resolved)
                if author_hint:
                    author_key = _normalize_author_key(author_hint)
                    if author_key:
                        index.setdefault(f"authoryear:{year_hint}:{author_key}", []).append(resolved)
    return index


def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    index: dict[str, list[Path]] = {}
    for path in paths:
        try:
            if not path.is_file():
                continue
        except OSError:
            continue
        suffix = path.suffix.lower()
        if suffix not in suffixes:
            name_lower = path.name.lower()
            if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
                pass
            else:
                continue
        resolved = path.resolve()
        name_key = path.name.lower()
        index.setdefault(name_key, []).append(resolved)
        title_candidate = _extract_title_from_filename(path.name)
        title_key = _normalize_title_key(title_candidate)
        if title_key:
            if title_key != name_key:
                index.setdefault(title_key, []).append(resolved)
            compact_key = _compact_title_key(title_key)
            if compact_key:
                index.setdefault(f"compact:{compact_key}", []).append(resolved)
            prefix_key = _title_prefix_key(title_key)
            if prefix_key:
                index.setdefault(prefix_key, []).append(resolved)
            stripped_key = _strip_leading_numeric_tokens(title_key)
            if stripped_key and stripped_key != title_key:
                index.setdefault(stripped_key, []).append(resolved)
                stripped_compact = _compact_title_key(stripped_key)
                if stripped_compact:
                    index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
                stripped_prefix = _title_prefix_key(stripped_key)
                if stripped_prefix:
                    index.setdefault(stripped_prefix, []).append(resolved)
    return index


def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
    index: dict[str, dict[str, Path]] = {}
    candidates: list[Path] = []
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        try:
            candidates.extend(root.rglob("*.md"))
        except OSError:
            continue
    for path in sorted(candidates, key=lambda item: str(item)):
        try:
            if not path.is_file():
                continue
        except OSError:
            continue
        name = path.name
        match = re.match(r"^(.+)\.([^.]+)\.md$", name, flags=re.IGNORECASE)
        if not match:
            continue
        base_name = match.group(1).strip()
        lang = match.group(2).strip()
        if not base_name or not lang:
            continue
        base_key = base_name.lower()
        lang_key = lang.lower()
        index.setdefault(base_key, {}).setdefault(lang_key, path.resolve())
    return index


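# Translated files are keyed by "<base>.<lang>.md"; the filename split that
# _build_translated_index relies on, with an invented name:
def _example_translated_name_split() -> None:
    m = re.match(r"^(.+)\.([^.]+)\.md$", "Attention Is All You Need.pdf.zh.md")
    assert m is not None
    assert (m.group(1), m.group(2)) == ("Attention Is All You Need.pdf", "zh")

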
def _resolve_source_md(paper: dict[str, Any], md_index: dict[str, list[Path]]) -> Path | None:
    source_path = paper.get("source_path")
    if not source_path:
        source_path = ""
    if source_path:
        name = Path(str(source_path)).name.lower()
        candidates = md_index.get(name, [])
        if candidates:
            return candidates[0]
    match, _, _ = _resolve_by_title_and_meta(paper, md_index)
    return match


def _guess_pdf_names(paper: dict[str, Any]) -> list[str]:
    source_path = paper.get("source_path")
    if not source_path:
        return []
    name = Path(str(source_path)).name
    match = re.match(r"(?i)(.+\.pdf)(?:-[0-9a-f\-]{8,})?\.md$", name)
    if match:
        return [Path(match.group(1)).name]
    if ".pdf-" in name.lower():
        base = name[: name.lower().rfind(".pdf-") + 4]
        return [Path(base).name]
    if name.lower().endswith(".pdf"):
        return [name]
    if name.lower().endswith(".pdf.md"):
        return [name[:-3]]
    return []


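# _guess_pdf_names recovers the original PDF filename from a converter output
# name such as "<title>.pdf-<hash>.md"; paths invented:
def _example_guess_pdf_names() -> None:
    assert _guess_pdf_names({"source_path": "md/Attention.pdf-1a2b3c4d.md"}) == ["Attention.pdf"]
    assert _guess_pdf_names({"source_path": "pdfs/Attention.pdf"}) == ["Attention.pdf"]
    assert _guess_pdf_names({"source_path": "md/notes.md"}) == []

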
def _resolve_pdf(paper: dict[str, Any], pdf_index: dict[str, list[Path]]) -> Path | None:
    for filename in _guess_pdf_names(paper):
        candidates = pdf_index.get(filename.lower(), [])
        if candidates:
            return candidates[0]
    match, _, _ = _resolve_by_title_and_meta(paper, pdf_index)
    return match


def build_index(
    papers: list[dict[str, Any]],
    *,
    md_roots: list[Path] | None = None,
    md_translated_roots: list[Path] | None = None,
    pdf_roots: list[Path] | None = None,
) -> PaperIndex:
    id_by_hash: dict[str, int] = {}
    by_tag: dict[str, set[int]] = {}
    by_author: dict[str, set[int]] = {}
    by_year: dict[str, set[int]] = {}
    by_month: dict[str, set[int]] = {}
    by_venue: dict[str, set[int]] = {}

    md_path_by_hash: dict[str, Path] = {}
    translated_md_by_hash: dict[str, dict[str, Path]] = {}
    pdf_path_by_hash: dict[str, Path] = {}

    md_file_index = _build_file_index(md_roots or [], suffixes={".md"})
    translated_index = _build_translated_index(md_translated_roots or [])
    pdf_file_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})

    year_counts: dict[str, int] = {}
    month_counts: dict[str, int] = {}
    tag_counts: dict[str, int] = {}
    keyword_counts: dict[str, int] = {}
    author_counts: dict[str, int] = {}
    venue_counts: dict[str, int] = {}
    template_tag_counts: dict[str, int] = {}

    def add_index(index: dict[str, set[int]], key: str, idx: int) -> None:
        index.setdefault(key, set()).add(idx)

    for idx, paper in enumerate(papers):
        is_pdf_only = bool(paper.get("_is_pdf_only"))
        source_hash = paper.get("source_hash")
        if not source_hash and paper.get("source_path"):
            source_hash = stable_hash(str(paper.get("source_path")))
        if source_hash:
            id_by_hash[str(source_hash)] = idx

        title = str(paper.get("paper_title") or "")
        paper["_title_lc"] = title.lower()

        bib_fields: dict[str, Any] = {}
        if isinstance(paper.get("bibtex"), dict):
            bib_fields = paper.get("bibtex", {}).get("fields", {}) or {}

        year = None
        if bib_fields.get("year") and str(bib_fields.get("year")).isdigit():
            year = str(bib_fields.get("year"))
        month = _normalize_month_token(bib_fields.get("month"))
        if not year or not month:
            parsed_year, parsed_month = _parse_year_month(str(paper.get("publication_date") or ""))
            year = year or parsed_year
            month = month or parsed_month

        year_label = year or "Unknown"
        month_label = month or "Unknown"
        paper["_year"] = year_label
        paper["_month"] = month_label
        add_index(by_year, _normalize_key(year_label), idx)
        add_index(by_month, _normalize_key(month_label), idx)
        if not is_pdf_only:
            year_counts[year_label] = year_counts.get(year_label, 0) + 1
            month_counts[month_label] = month_counts.get(month_label, 0) + 1

        venue = _extract_venue(paper).strip()
        paper["_venue"] = venue
        if venue:
            add_index(by_venue, _normalize_key(venue), idx)
            if not is_pdf_only:
                venue_counts[venue] = venue_counts.get(venue, 0) + 1
        else:
            add_index(by_venue, "unknown", idx)
            if not is_pdf_only:
                venue_counts["Unknown"] = venue_counts.get("Unknown", 0) + 1

        authors = _extract_authors(paper)
        paper["_authors"] = authors
        for author in authors:
            key = _normalize_key(author)
            add_index(by_author, key, idx)
            if not is_pdf_only:
                author_counts[author] = author_counts.get(author, 0) + 1

        tags = _extract_tags(paper)
        paper["_tags"] = tags
        for tag in tags:
            key = _normalize_key(tag)
            add_index(by_tag, key, idx)
            if not is_pdf_only:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

        keywords = _extract_keywords(paper)
        paper["_keywords"] = keywords
        for keyword in keywords:
            if not is_pdf_only:
                keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

        template_tags = _available_templates(paper)
        if not template_tags:
            fallback_tag = paper.get("template_tag") or paper.get("prompt_template")
            if fallback_tag:
                template_tags = [str(fallback_tag)]
        paper["_template_tags"] = template_tags
        paper["_template_tags_lc"] = [tag.lower() for tag in template_tags]
        paper["_has_summary"] = _has_summary(paper, template_tags)
        if not is_pdf_only:
            for tag in template_tags:
                template_tag_counts[tag] = template_tag_counts.get(tag, 0) + 1

        search_parts = [title, venue, " ".join(authors), " ".join(tags)]
        paper["_search_lc"] = " ".join(part for part in search_parts if part).lower()

        source_hash_str = str(source_hash) if source_hash else str(idx)
        md_path = _resolve_source_md(paper, md_file_index)
        if md_path is not None:
            md_path_by_hash[source_hash_str] = md_path
            base_key = md_path.with_suffix("").name.lower()
            translations = translated_index.get(base_key, {})
            if translations:
                translated_md_by_hash[source_hash_str] = translations
        pdf_path = _resolve_pdf(paper, pdf_file_index)
        if pdf_path is not None:
            pdf_path_by_hash[source_hash_str] = pdf_path

    def year_sort_key(item: tuple[int, dict[str, Any]]) -> tuple[int, int, str]:
        idx, paper = item
        year_label = str(paper.get("_year") or "Unknown")
        title_label = str(paper.get("paper_title") or "")
        if year_label.isdigit():
            return (0, -int(year_label), title_label.lower())
        return (1, 0, title_label.lower())

    ordered_ids = [idx for idx, _ in sorted(enumerate(papers), key=year_sort_key)]

    stats_total = sum(1 for paper in papers if not paper.get("_is_pdf_only"))
    stats = {
        "total": stats_total,
        "years": _sorted_counts(year_counts, numeric_desc=True),
        "months": _sorted_month_counts(month_counts),
        "tags": _sorted_counts(tag_counts),
        "keywords": _sorted_counts(keyword_counts),
        "authors": _sorted_counts(author_counts),
        "venues": _sorted_counts(venue_counts),
    }

    template_tags = sorted(template_tag_counts.keys(), key=lambda item: item.lower())

    return PaperIndex(
        papers=papers,
        id_by_hash=id_by_hash,
        ordered_ids=ordered_ids,
        by_tag=by_tag,
        by_author=by_author,
        by_year=by_year,
        by_month=by_month,
        by_venue=by_venue,
        stats=stats,
        md_path_by_hash=md_path_by_hash,
        translated_md_by_hash=translated_md_by_hash,
        pdf_path_by_hash=pdf_path_by_hash,
        template_tags=template_tags,
    )


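# End-to-end sketch: merge the extracted databases, then build the in-memory
# index the web UI serves from. The paths below are hypothetical and must
# exist for this to run.
def _example_build_index() -> None:
    papers = load_and_merge_papers([Path("db/papers.json")], pdf_roots=[Path("pdfs")])
    index = build_index(papers, md_roots=[Path("markdown")], pdf_roots=[Path("pdfs")])
    print(index.stats["total"])                      # PDF-only entries are excluded from stats
    print(sorted(index.by_year.get("2021", set())))  # paper ids filed under year 2021

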
def _sorted_counts(counts: dict[str, int], *, numeric_desc: bool = False) -> list[dict[str, Any]]:
    items = list(counts.items())
    if numeric_desc:
        def key(item: tuple[str, int]) -> tuple[int, int]:
            label, count = item
            if label.isdigit():
                return (0, -int(label))
            return (1, 0)
        items.sort(key=key)
    else:
        items.sort(key=lambda item: item[1], reverse=True)
    return [{"label": k, "count": v} for k, v in items]


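# numeric_desc puts digit labels newest-first with non-numeric labels last;
# the default orders by count. Labels invented:
def _example_sorted_counts() -> None:
    counts = {"2021": 3, "Unknown": 1, "2019": 2}
    assert _sorted_counts(counts, numeric_desc=True) == [
        {"label": "2021", "count": 3},
        {"label": "2019", "count": 2},
        {"label": "Unknown", "count": 1},
    ]

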
def _sorted_month_counts(counts: dict[str, int]) -> list[dict[str, Any]]:
    def month_sort(label: str) -> int:
        if label == "Unknown":
            return 99
        if label.isdigit():
            return int(label)
        return 98

    items = sorted(counts.items(), key=lambda item: month_sort(item[0]))
    return [{"label": k, "count": v} for k, v in items]


# ============================================================================
# Data Layer Helpers: Load, Merge, Cache, PDF-only Entries
# ============================================================================

_TEMPLATE_INFER_IGNORE_KEYS = {
    "source_path",
    "source_hash",
    "provider",
    "model",
    "extracted_at",
    "truncation",
    "output_language",
    "prompt_template",
}


def _load_paper_inputs(paths: list[Path]) -> list[dict[str, Any]]:
    """Load paper JSON files and infer template tags if needed."""
    # Delayed import to avoid circular dependency with template_registry
    from deepresearch_flow.paper.template_registry import (
        list_template_names_in_registry_order,
        load_schema_for_template,
    )

    inputs: list[dict[str, Any]] = []
    for path in paths:
        payload = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(payload, list):
            raise ValueError(
                f"Input JSON must be an object with template_tag and papers (got array): {path}"
            )
        if not isinstance(payload, dict):
            raise ValueError(f"Input JSON must be an object: {path}")
        papers = payload.get("papers")
        if not isinstance(papers, list):
            raise ValueError(f"Input JSON missing papers list: {path}")
        template_tag = payload.get("template_tag")
        if not template_tag:
            template_tag = _infer_template_tag(
                papers, path, list_template_names_in_registry_order, load_schema_for_template
            )
        inputs.append({"template_tag": str(template_tag), "papers": papers})
    return inputs


def _infer_template_tag(
    papers: list[dict[str, Any]],
    path: Path,
    list_template_names_in_registry_order,
    load_schema_for_template,
) -> str:
    """Infer template tag from paper content."""
    prompt_tags = {
        str(paper.get("prompt_template"))
        for paper in papers
        if isinstance(paper, dict) and paper.get("prompt_template")
    }
    if len(prompt_tags) == 1:
        return prompt_tags.pop()

    sample = next((paper for paper in papers if isinstance(paper, dict)), None)
    if sample is None:
        raise ValueError(f"Input JSON has no paper objects to infer template_tag: {path}")

    paper_keys = {key for key in sample.keys() if key not in _TEMPLATE_INFER_IGNORE_KEYS}
    if not paper_keys:
        raise ValueError(f"Input JSON papers have no keys to infer template_tag: {path}")

    best_tag = None
    best_score = -1
    for name in list_template_names_in_registry_order():
        schema = load_schema_for_template(name)
        schema_keys = set((schema.get("properties") or {}).keys())
        score = len(paper_keys & schema_keys)
        if score > best_score:
            best_score = score
            best_tag = name
        elif score == best_score:
            if best_tag != "simple" and name == "simple":
                best_tag = name

    if not best_tag:
        raise ValueError(f"Unable to infer template_tag from input JSON: {path}")
    return best_tag


def _build_cache_meta(
    db_paths: list[Path],
    bibtex_path: Path | None,
    pdf_roots_meta: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Build cache metadata for invalidation."""
    def file_meta(path: Path) -> dict[str, Any]:
        try:
            stats = path.stat()
        except OSError as exc:
            raise ValueError(f"Failed to read input metadata for cache: {path}") from exc
        return {"path": str(path), "mtime": stats.st_mtime, "size": stats.st_size}

    meta = {
        "version": 1,
        "inputs": [file_meta(path) for path in db_paths],
        "bibtex": file_meta(bibtex_path) if bibtex_path else None,
    }
    if pdf_roots_meta is not None:
        meta["pdf_roots"] = pdf_roots_meta
    return meta


def _load_cached_papers(cache_dir: Path, meta: dict[str, Any]) -> list[dict[str, Any]] | None:
    """Load cached papers if metadata matches."""
    meta_path = cache_dir / "db_serve_cache.meta.json"
    data_path = cache_dir / "db_serve_cache.papers.json"
    if not meta_path.exists() or not data_path.exists():
        return None
    try:
        cached_meta = json.loads(meta_path.read_text(encoding="utf-8"))
        if cached_meta != meta:
            return None
        cached_papers = json.loads(data_path.read_text(encoding="utf-8"))
        if not isinstance(cached_papers, list):
            return None
        return cached_papers
    except Exception:
        return None


def _write_cached_papers(cache_dir: Path, meta: dict[str, Any], papers: list[dict[str, Any]]) -> None:
    """Write cached papers and metadata."""
    meta_path = cache_dir / "db_serve_cache.meta.json"
    data_path = cache_dir / "db_serve_cache.papers.json"
    meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
    data_path.write_text(json.dumps(papers, ensure_ascii=False, indent=2), encoding="utf-8")


def _extract_year_for_matching(paper: dict[str, Any]) -> str | None:
    """Extract year from bibtex or publication_date for matching."""
    if isinstance(paper.get("bibtex"), dict):
        fields = paper.get("bibtex", {}).get("fields", {}) or {}
        year = fields.get("year")
        if year and str(year).isdigit():
            return str(year)
    parsed_year, _ = _parse_year_month(str(paper.get("publication_date") or ""))
    return parsed_year


def _prepare_paper_matching_fields(paper: dict[str, Any]) -> None:
    """Ensure paper has _authors and _year fields for matching."""
    if "_authors" not in paper:
        paper["_authors"] = _extract_authors(paper)
    if "_year" not in paper:
        paper["_year"] = _extract_year_for_matching(paper) or ""


def _build_pdf_only_entries(
    papers: list[dict[str, Any]],
    pdf_paths: list[Path],
    pdf_index: dict[str, list[Path]],
) -> list[dict[str, Any]]:
    """Build paper entries for unmatched PDFs."""
    matched: set[Path] = set()
    for paper in papers:
        _prepare_paper_matching_fields(paper)
        pdf_path = _resolve_pdf(paper, pdf_index)
        if pdf_path:
            matched.add(pdf_path.resolve())

    entries: list[dict[str, Any]] = []
    for path in pdf_paths:
        resolved = path.resolve()
        if resolved in matched:
            continue
        title = _read_pdf_metadata_title(resolved) or _extract_title_from_filename(resolved.name)
        if not title:
            title = resolved.stem
        year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
        entry: dict[str, Any] = {
            "paper_title": title,
            "paper_authors": [author_hint] if author_hint else [],
            "publication_date": year_hint or "",
            "source_hash": stable_hash(str(resolved)),
            "source_path": str(resolved),
            "_is_pdf_only": True,
        }
        entries.append(entry)
    return entries


def _normalize_merge_title(value: str | None) -> str | None:
    """Normalize title for merging."""
    if not value:
        return None
    return str(value).replace("{", "").replace("}", "").strip().lower()


def _extract_bibtex_title(paper: dict[str, Any]) -> str | None:
    """Extract normalized title from bibtex."""
    if not isinstance(paper.get("bibtex"), dict):
        return None
    fields = paper.get("bibtex", {}).get("fields", {}) or {}
    return _normalize_merge_title(fields.get("title"))


def _extract_paper_title(paper: dict[str, Any]) -> str | None:
    """Extract normalized paper_title."""
    return _normalize_merge_title(paper.get("paper_title"))


def _titles_match(group: dict[str, Any], paper: dict[str, Any], *, threshold: float) -> bool:
    """Check if paper title matches group titles."""
    bib_title = _extract_bibtex_title(paper)
    group_bib = group.get("_merge_bibtex_titles") or set()
    if bib_title and group_bib:
        return any(_title_similarity(bib_title, existing) >= threshold for existing in group_bib)

    paper_title = _extract_paper_title(paper)
    group_titles = group.get("_merge_paper_titles") or set()
    if paper_title and group_titles:
        return any(_title_similarity(paper_title, existing) >= threshold for existing in group_titles)
    return False


def _add_merge_titles(group: dict[str, Any], paper: dict[str, Any]) -> None:
    """Add paper titles to group merge tracking."""
    bib_title = _extract_bibtex_title(paper)
    if bib_title:
        group.setdefault("_merge_bibtex_titles", set()).add(bib_title)
    paper_title = _extract_paper_title(paper)
    if paper_title:
        group.setdefault("_merge_paper_titles", set()).add(paper_title)


def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Merge paper inputs from multiple template extractions."""
    merged: list[dict[str, Any]] = []
    threshold = 0.95
    prefix_len = 5
    bibtex_exact: dict[str, set[int]] = {}
    bibtex_prefix: dict[str, set[int]] = {}
    paper_exact: dict[str, set[int]] = {}
    paper_prefix: dict[str, set[int]] = {}

    def prefix_key(value: str) -> str:
        return value[:prefix_len] if len(value) >= prefix_len else value

    def add_index(
        value: str,
        exact_index: dict[str, set[int]],
        prefix_index: dict[str, set[int]],
        idx: int,
    ) -> None:
        exact_index.setdefault(value, set()).add(idx)
        prefix_index.setdefault(prefix_key(value), set()).add(idx)

    def candidate_ids(bib_title: str | None, paper_title: str | None) -> list[int]:
        ids: set[int] = set()
        if bib_title:
            ids |= bibtex_exact.get(bib_title, set())
            ids |= bibtex_prefix.get(prefix_key(bib_title), set())
        if paper_title:
            ids |= paper_exact.get(paper_title, set())
            ids |= paper_prefix.get(prefix_key(paper_title), set())
        return sorted(ids)

    for bundle in inputs:
        template_tag = bundle.get("template_tag")
        papers = bundle.get("papers") or []
        for paper in papers:
            if not isinstance(paper, dict):
                raise ValueError("Input papers must be objects")
            bib_title = _extract_bibtex_title(paper)
            paper_title = _extract_paper_title(paper)
            match = None
            match_idx = None
            for idx in candidate_ids(bib_title, paper_title):
                candidate = merged[idx]
                if _titles_match(candidate, paper, threshold=threshold):
                    match = candidate
                    match_idx = idx
                    break
            if match is None:
                group = {
                    "templates": {template_tag: paper},
                    "template_order": [template_tag],
                }
                _add_merge_titles(group, paper)
                merged.append(group)
                group_idx = len(merged) - 1
                if bib_title:
                    add_index(bib_title, bibtex_exact, bibtex_prefix, group_idx)
                if paper_title:
                    add_index(paper_title, paper_exact, paper_prefix, group_idx)
            else:
                templates = match.setdefault("templates", {})
                templates[template_tag] = paper
                order = match.setdefault("template_order", [])
                if template_tag not in order:
                    order.append(template_tag)
                _add_merge_titles(match, paper)
                if match_idx is not None:
                    if bib_title:
                        add_index(bib_title, bibtex_exact, bibtex_prefix, match_idx)
                    if paper_title:
                        add_index(paper_title, paper_exact, paper_prefix, match_idx)

    for group in merged:
        templates = group.get("templates") or {}
        order = group.get("template_order") or list(templates.keys())
        default_tag = "simple" if "simple" in order else (order[0] if order else None)
        group["default_template"] = default_tag
        if default_tag and default_tag in templates:
            base = templates[default_tag]
            for key, value in base.items():
                group[key] = value
        group.pop("_merge_bibtex_titles", None)
        group.pop("_merge_paper_titles", None)
    return merged


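# Two extractions of the same paper (title similarity >= 0.95) collapse into
# one group, and "simple" wins as the default template when present. Records
# below are invented:
def _example_merge_paper_inputs() -> None:
    inputs = [
        {"template_tag": "simple", "papers": [{"paper_title": "Attention Is All You Need"}]},
        {"template_tag": "detailed", "papers": [{"paper_title": "Attention is all you need"}]},
    ]
    merged = _merge_paper_inputs(inputs)
    assert len(merged) == 1
    assert merged[0]["template_order"] == ["simple", "detailed"]
    assert merged[0]["default_template"] == "simple"

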
def _normalize_bibtex_title(title: str) -> str:
|
|
1279
|
+
"""Normalize bibtex title for matching."""
|
|
1280
|
+
value = title.replace("{", "").replace("}", "")
|
|
1281
|
+
value = re.sub(r"[^a-z0-9]+", " ", value.lower())
|
|
1282
|
+
return re.sub(r"\s+", " ", value).strip()
|
|
1283
|
+
|
|
1284
|
+
|
|
1285
|
+
def enrich_with_bibtex(papers: list[dict[str, Any]], bibtex_path: Path) -> None:
|
|
1286
|
+
"""Enrich papers with bibtex metadata."""
|
|
1287
|
+
if not PYBTEX_AVAILABLE:
|
|
1288
|
+
raise RuntimeError("pybtex is required for --bibtex support")
|
|
1289
|
+
|
|
1290
|
+
bib_data = parse_file(str(bibtex_path))
|
|
1291
|
+
entries: list[dict[str, Any]] = []
|
|
1292
|
+
by_prefix: dict[str, list[int]] = {}
|
|
1293
|
+
for key, entry in bib_data.entries.items():
|
|
1294
|
+
fields = dict(entry.fields)
|
|
1295
|
+
title = str(fields.get("title") or "").strip()
|
|
1296
|
+
title_norm = _normalize_bibtex_title(title)
|
|
1297
|
+
if not title_norm:
|
|
1298
|
+
continue
|
|
1299
|
+
record = {
|
|
1300
|
+
"key": key,
|
|
1301
|
+
"type": entry.type,
|
|
1302
|
+
"fields": fields,
|
|
1303
|
+
"persons": {role: [str(p) for p in persons] for role, persons in entry.persons.items()},
|
|
1304
|
+
"_title_norm": title_norm,
|
|
1305
|
+
}
|
|
1306
|
+
idx = len(entries)
|
|
1307
|
+
entries.append(record)
|
|
1308
|
+
prefix = title_norm[:16]
|
|
1309
|
+
by_prefix.setdefault(prefix, []).append(idx)
|
|
1310
|
+
|
|
1311
|
+
for paper in papers:
|
|
1312
|
+
if isinstance(paper.get("bibtex"), dict):
|
|
1313
|
+
continue
|
|
1314
|
+
title = str(paper.get("paper_title") or "").strip()
|
|
1315
|
+
if not title:
|
|
1316
|
+
continue
|
|
1317
|
+
norm = _normalize_bibtex_title(title)
|
|
1318
|
+
if not norm:
|
|
1319
|
+
continue
|
|
1320
|
+
|
|
1321
|
+
candidates = []
|
|
1322
|
+
prefix = norm[:16]
|
|
1323
|
+
for cand_idx in by_prefix.get(prefix, []):
|
|
1324
|
+
candidates.append(entries[cand_idx])
|
|
1325
|
+
if not candidates:
|
|
1326
|
+
candidates = entries
|
|
1327
|
+
|
|
1328
|
+
best = None
|
|
1329
|
+
best_score = 0.0
|
|
1330
|
+
for entry in candidates:
|
|
1331
|
+
score = _title_similarity(norm, entry["_title_norm"])
|
|
1332
|
+
if score > best_score:
|
|
1333
|
+
best_score = score
|
|
1334
|
+
best = entry
|
|
1335
|
+
|
|
1336
|
+
if best is not None and best_score >= 0.9:
|
|
1337
|
+
paper["bibtex"] = {k: v for k, v in best.items() if not k.startswith("_")}
|
|
1338
|
+
|
|
1339
|
+
|
|
1340
|
+
def load_and_merge_papers(
    db_paths: list[Path],
    bibtex_path: Path | None = None,
    cache_dir: Path | None = None,
    use_cache: bool = True,
    pdf_roots: list[Path] | None = None,
) -> list[dict[str, Any]]:
    """Load and merge papers from multiple JSON files, with optional caching and PDF-only entries."""
    cache_meta = None
    pdf_roots = pdf_roots or []
    pdf_paths: list[Path] = []
    pdf_roots_meta: list[dict[str, Any]] | None = None
    if pdf_roots:
        pdf_paths, pdf_roots_meta = _scan_pdf_roots(pdf_roots)
    if cache_dir and use_cache:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_meta = _build_cache_meta(db_paths, bibtex_path, pdf_roots_meta)
        cached = _load_cached_papers(cache_dir, cache_meta)
        if cached is not None:
            return cached

    inputs = _load_paper_inputs(db_paths)
    if bibtex_path is not None:
        for bundle in inputs:
            enrich_with_bibtex(bundle["papers"], bibtex_path)
    papers = _merge_paper_inputs(inputs)
    if pdf_paths:
        pdf_index = _build_file_index_from_paths(pdf_paths, suffixes={".pdf"})
        papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))

    if cache_dir and use_cache and cache_meta is not None:
        _write_cached_papers(cache_dir, cache_meta, papers)
    return papers


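# Illustrative sketch (not in the original source): a typical call merging two
# extraction databases with bibtex enrichment, caching, and PDF-only stubs.
# All paths are hypothetical.
def _demo_load_and_merge_papers() -> None:
    papers = load_and_merge_papers(
        [Path("db_a.json"), Path("db_b.json")],
        bibtex_path=Path("refs.bib"),
        cache_dir=Path(".cache/papers"),
        pdf_roots=[Path("pdfs")],
    )
    print(f"{len(papers)} merged papers")

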
# ============================================================================
# Compare Logic for paper db compare
# ============================================================================

from typing import Literal


@dataclass
class CompareResult:
    """Result of comparing two datasets."""
    side: Literal["A", "B", "MATCH"]
    source_hash: str
    title: str
    match_status: Literal["matched", "only_in_A", "only_in_B", "matched_pair"]
    match_type: str | None = None
    match_score: float = 0.0
    source_path: str | None = None
    other_source_hash: str | None = None
    other_title: str | None = None
    other_source_path: str | None = None
    lang: str | None = None


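# Illustrative sketch (not in the original source): results arrive as one row
# per paper on each side plus one "MATCH" row per pair, so set differences are
# plain filters over match_status.
def _demo_only_in_a(results: list[CompareResult]) -> list[CompareResult]:
    return [r for r in results if r.side == "A" and r.match_status == "only_in_A"]

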
@dataclass
class CompareDataset:
    """Prepared dataset for compare."""
    papers: list[dict[str, Any]]
    md_index: dict[str, list[Path]]
    pdf_index: dict[str, list[Path]]
    translated_index: dict[str, dict[str, Path]]
    paper_index: dict[str, list[dict[str, Any]]]
    path_to_index: dict[Path, int]
    hash_to_index: dict[str, int]
    paper_id_to_index: dict[int, int]


def _scan_md_roots(roots: list[Path]) -> list[Path]:
    paths: list[Path] = []
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        try:
            for path in root.rglob("*.md"):
                try:
                    if not path.is_file():
                        continue
                except OSError:
                    continue
                paths.append(path.resolve())
        except OSError:
            continue
    return paths


def _merge_file_indexes(*indexes: dict[str, list[Path]]) -> dict[str, list[Path]]:
    merged: dict[str, list[Path]] = {}
    for index in indexes:
        for key, paths in index.items():
            merged.setdefault(key, []).extend(paths)
    return merged


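# Illustrative check (not in the original source): merging keeps every path
# under a shared key instead of letting one index shadow the other.
def _demo_merge_file_indexes() -> None:
    merged = _merge_file_indexes(
        {"paper": [Path("a/paper.md")]},
        {"paper": [Path("b/paper.pdf")]},
    )
    assert merged["paper"] == [Path("a/paper.md"), Path("b/paper.pdf")]

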
def _build_md_only_entries(
    papers: list[dict[str, Any]],
    md_paths: list[Path],
    md_index: dict[str, list[Path]],
) -> list[dict[str, Any]]:
    matched: set[Path] = set()
    for paper in papers:
        _prepare_paper_matching_fields(paper)
        md_path = _resolve_source_md(paper, md_index)
        if md_path:
            matched.add(md_path.resolve())
    entries: list[dict[str, Any]] = []
    for path in md_paths:
        resolved = path.resolve()
        if resolved in matched:
            continue
        title = _extract_title_from_filename(resolved.name) or resolved.stem
        year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
        entry: dict[str, Any] = {
            "paper_title": title,
            "paper_authors": [author_hint] if author_hint else [],
            "publication_date": year_hint or "",
            "source_hash": stable_hash(str(resolved)),
            "source_path": str(resolved),
            "_is_md_only": True,
        }
        entries.append(entry)
    return entries


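# Illustrative sketch (not in the original source): an orphan Markdown file
# becomes a stub entry hashed from its resolved path. The filename is
# hypothetical; title/year/author hints are parsed from it by helpers above.
def _demo_md_only_entry() -> None:
    entries = _build_md_only_entries([], [Path("notes/2021_Smith_Some_Paper.md")], {})
    print(entries[0]["paper_title"], entries[0]["_is_md_only"])

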
def _translation_base_key_for_paper(paper: dict[str, Any]) -> str:
    source_path = str(paper.get("source_path") or "")
    if source_path:
        return Path(source_path).stem.lower()
    title = str(paper.get("paper_title") or "")
    return _normalize_title_key(title)


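# Illustrative check (not in the original source): the translation base key
# prefers the lowercased source file stem, falling back to the normalized title.
def _demo_translation_base_key() -> None:
    assert _translation_base_key_for_paper({"source_path": "/x/My_Paper.md"}) == "my_paper"

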
def _build_translated_only_entries(
    papers: list[dict[str, Any]],
    translated_index: dict[str, dict[str, Path]],
    lang: str,
) -> list[dict[str, Any]]:
    if not lang:
        return []
    lang_key = lang.lower()
    matched: set[Path] = set()
    for paper in papers:
        base_key = _translation_base_key_for_paper(paper)
        if not base_key:
            continue
        path = translated_index.get(base_key, {}).get(lang_key)
        if path:
            matched.add(path.resolve())
    entries: list[dict[str, Any]] = []
    for base_key, translations in translated_index.items():
        path = translations.get(lang_key)
        if not path:
            continue
        resolved = path.resolve()
        if resolved in matched:
            continue
        title = _extract_title_from_filename(resolved.name) or resolved.stem
        year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
        entry: dict[str, Any] = {
            "paper_title": title,
            "paper_authors": [author_hint] if author_hint else [],
            "publication_date": year_hint or "",
            "source_hash": stable_hash(str(resolved)),
            "source_path": str(resolved),
            "_is_translated_only": True,
            "translation_lang": lang_key,
        }
        entries.append(entry)
    return entries


def _build_paper_index(papers: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
    index: dict[str, list[dict[str, Any]]] = {}
    for paper in papers:
        _prepare_paper_matching_fields(paper)
        title = str(paper.get("paper_title") or "")
        title_key = _normalize_title_key(title)
        if title_key:
            index.setdefault(title_key, []).append(paper)
            compact_key = _compact_title_key(title_key)
            if compact_key:
                index.setdefault(f"compact:{compact_key}", []).append(paper)
            prefix_key = _title_prefix_key(title_key)
            if prefix_key:
                index.setdefault(prefix_key, []).append(paper)
            stripped_key = _strip_leading_numeric_tokens(title_key)
            if stripped_key and stripped_key != title_key:
                index.setdefault(stripped_key, []).append(paper)
                stripped_compact = _compact_title_key(stripped_key)
                if stripped_compact:
                    index.setdefault(f"compact:{stripped_compact}", []).append(paper)
                stripped_prefix = _title_prefix_key(stripped_key)
                if stripped_prefix:
                    index.setdefault(stripped_prefix, []).append(paper)
        year = str(paper.get("_year") or "").strip()
        if year:
            index.setdefault(f"year:{year}", []).append(paper)
            authors = paper.get("_authors") or []
            if authors:
                author_key = _normalize_author_key(str(authors[0]))
                if author_key:
                    index.setdefault(f"authoryear:{year}:{author_key}", []).append(paper)
    return index


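# Illustrative sketch (not in the original source): one paper fans out to
# several lookup keys (exact title, "compact:", prefix, "year:",
# "authoryear:") so resolution can fall back tier by tier. The exact keys
# depend on the normalization helpers defined earlier in this module.
def _demo_build_paper_index() -> None:
    paper = {"paper_title": "Deep Learning", "publication_date": "2015", "paper_authors": ["Y. LeCun"]}
    print(sorted(_build_paper_index([paper])))

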
def _adaptive_similarity_match_papers(
    title_key: str,
    candidates: list[dict[str, Any]],
) -> tuple[dict[str, Any] | None, float]:
    if not title_key:
        return None, 0.0
    scored: list[tuple[dict[str, Any], float]] = []
    for paper in candidates:
        candidate_title = _normalize_title_key(str(paper.get("paper_title") or ""))
        if not candidate_title:
            continue
        if _title_overlap_match(title_key, candidate_title):
            return paper, 1.0
        scored.append((paper, _title_similarity(title_key, candidate_title)))
    if not scored:
        return None, 0.0

    def matches_at(threshold: float) -> list[tuple[dict[str, Any], float]]:
        return [(paper, score) for paper, score in scored if score >= threshold]

    threshold = _SIMILARITY_START
    step = _SIMILARITY_STEP
    prev_threshold = None
    prev_count = None
    for _ in range(_SIMILARITY_MAX_STEPS):
        matches = matches_at(threshold)
        if len(matches) == 1:
            paper, score = matches[0]
            return paper, score
        if len(matches) == 0:
            prev_threshold = threshold
            prev_count = 0
            threshold -= step
            continue
        if prev_count == 0 and prev_threshold is not None:
            low = threshold
            high = prev_threshold
            for _ in range(_SIMILARITY_MAX_STEPS):
                mid = (low + high) / 2
                mid_matches = matches_at(mid)
                if len(mid_matches) == 1:
                    paper, score = mid_matches[0]
                    return paper, score
                if len(mid_matches) == 0:
                    high = mid
                else:
                    low = mid
            return None, 0.0
        prev_threshold = threshold
        prev_count = len(matches)
        threshold -= step
    return None, 0.0


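# Simplified standalone analogue (not in the original source) of the adaptive
# search above: lower the threshold until exactly one score survives. The real
# implementation additionally bisects when a step jumps from zero survivors to
# several at once. The start/step defaults here are the demo's own, not the
# module constants.
def _demo_adaptive_threshold(scores: list[float], start: float = 0.95, step: float = 0.05) -> float | None:
    threshold = start
    while threshold > 0.0:
        survivors = [s for s in scores if s >= threshold]
        if len(survivors) == 1:
            return survivors[0]
        threshold -= step
    return None

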
def _resolve_paper_by_title_and_meta(
    paper: dict[str, Any],
    paper_index: dict[str, list[dict[str, Any]]],
) -> tuple[dict[str, Any] | None, str | None, float]:
    title = str(paper.get("paper_title") or "")
    title_key = _normalize_title_key(title)
    if not title_key:
        title_key = ""
    candidates = paper_index.get(title_key, [])
    if candidates:
        return candidates[0], "title", 1.0
    if title_key:
        compact_key = _compact_title_key(title_key)
        compact_candidates = paper_index.get(f"compact:{compact_key}", [])
        if compact_candidates:
            return compact_candidates[0], "title_compact", 1.0
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            stripped_candidates = paper_index.get(stripped_key, [])
            if stripped_candidates:
                return stripped_candidates[0], "title_stripped", 1.0
            stripped_compact = _compact_title_key(stripped_key)
            stripped_candidates = paper_index.get(f"compact:{stripped_compact}", [])
            if stripped_candidates:
                return stripped_candidates[0], "title_compact", 1.0
    prefix_candidates: list[dict[str, Any]] = []
    prefix_key = _title_prefix_key(title_key)
    if prefix_key:
        prefix_candidates = paper_index.get(prefix_key, [])
    if not prefix_candidates:
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            prefix_key = _title_prefix_key(stripped_key)
            if prefix_key:
                prefix_candidates = paper_index.get(prefix_key, [])
    if prefix_candidates:
        match, score = _adaptive_similarity_match_papers(title_key, prefix_candidates)
        if match is not None:
            match_type = "title_prefix" if score >= 1.0 else "title_fuzzy"
            return match, match_type, score
    year = str(paper.get("_year") or "").strip()
    if not year.isdigit():
        return None, None, 0.0
    author_key = ""
    authors = paper.get("_authors") or []
    if authors:
        author_key = _normalize_author_key(str(authors[0]))
    candidates = []
    match_type = "year"
    if author_key:
        candidates = paper_index.get(f"authoryear:{year}:{author_key}", [])
        if candidates:
            match_type = "author_year"
    if not candidates:
        candidates = paper_index.get(f"year:{year}", [])
    if not candidates:
        return None, None, 0.0
    if len(candidates) == 1 and not title_key:
        return candidates[0], match_type, 1.0
    match, score = _adaptive_similarity_match_papers(title_key, candidates)
    if match is not None:
        if score < _AUTHOR_YEAR_MIN_SIMILARITY:
            return None, None, 0.0
        return match, "title_fuzzy", score
    return None, None, 0.0


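# Illustrative summary (not in the original source): the match_type values the
# resolver above can report, ordered from exact to fuzzy.
_RESOLUTION_TIERS = (
    "title",
    "title_compact",
    "title_stripped",
    "title_prefix",
    "title_fuzzy",
    "author_year",
    "year",
)

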
def _get_paper_identifier(paper: dict[str, Any]) -> str:
    """Get a unique identifier for a paper."""
    return str(paper.get("source_hash") or paper.get("source_path", ""))


def _match_datasets(
    dataset_a: CompareDataset,
    dataset_b: CompareDataset,
    *,
    lang: str | None = None,
) -> list[CompareResult]:
    """Match papers between two datasets using db_ops parity."""
    results: list[CompareResult] = []
    matched_a: set[int] = set()
    matched_b: set[int] = set()
    matched_b_info: dict[int, tuple[int, str | None, float]] = {}
    match_pairs: list[tuple[int, int, str | None, float]] = []

    file_index_b = _merge_file_indexes(dataset_b.md_index, dataset_b.pdf_index)

    # Pass 1: one row per paper on side A. Try an exact hash match first, then
    # file-path/title resolution, then paper-index resolution, and finally the
    # translated-Markdown fallback when a language is given.
    for idx_a, paper in enumerate(dataset_a.papers):
        _prepare_paper_matching_fields(paper)
        source_hash = str(paper.get("source_hash") or "")
        title = str(paper.get("paper_title") or "")
        source_path = str(paper.get("source_path") or "")

        match_type = None
        match_score = 0.0
        match_status = "only_in_A"
        matched_b_idx: int | None = None
        matched_b_paper: dict[str, Any] | None = None

        if source_hash and source_hash in dataset_b.hash_to_index:
            matched_b_idx = dataset_b.hash_to_index[source_hash]
            matched_b_paper = dataset_b.papers[matched_b_idx]
            match_status = "matched"
            match_type = "hash"
            match_score = 1.0
        else:
            if file_index_b:
                matched_path, mt, score = _resolve_by_title_and_meta(paper, file_index_b)
                if matched_path is not None:
                    matched_b_idx = dataset_b.path_to_index.get(matched_path.resolve())
                    matched_b_paper = dataset_b.papers[matched_b_idx] if matched_b_idx is not None else None
                    match_status = "matched"
                    match_type = mt
                    match_score = score
            if matched_b_idx is None:
                match_paper, mt, score = _resolve_paper_by_title_and_meta(paper, dataset_b.paper_index)
                if match_paper is not None:
                    matched_b_idx = dataset_b.paper_id_to_index.get(id(match_paper))
                    matched_b_paper = match_paper
                    match_status = "matched"
                    match_type = mt
                    match_score = score
            if matched_b_idx is None and lang:
                base_key = _translation_base_key_for_paper(paper)
                if base_key:
                    translated_path = dataset_b.translated_index.get(base_key, {}).get(lang.lower())
                    if translated_path is not None:
                        matched_b_idx = dataset_b.path_to_index.get(translated_path.resolve())
                        matched_b_paper = dataset_b.papers[matched_b_idx] if matched_b_idx is not None else None
                        match_status = "matched"
                        match_type = f"translated_{lang.lower()}"
                        match_score = 1.0

        other_hash = None
        other_title = None
        other_path = None
        if matched_b_idx is not None and matched_b_paper is not None:
            matched_a.add(idx_a)
            matched_b.add(matched_b_idx)
            other_hash = str(matched_b_paper.get("source_hash") or "")
            other_title = str(matched_b_paper.get("paper_title") or "")
            other_path = str(matched_b_paper.get("source_path") or "")
            matched_b_info[matched_b_idx] = (idx_a, match_type, match_score)
            match_pairs.append((idx_a, matched_b_idx, match_type, match_score))

        results.append(
            CompareResult(
                side="A",
                source_hash=source_hash,
                title=title,
                match_status=match_status,
                match_type=match_type,
                match_score=match_score,
                source_path=source_path if source_path else None,
                other_source_hash=other_hash,
                other_title=other_title,
                other_source_path=other_path,
                lang=lang.lower() if lang else None,
            )
        )

    # Pass 2: one row per paper on side B, reusing the match info recorded in
    # pass 1 for papers that were already paired.
    for idx_b, paper in enumerate(dataset_b.papers):
        _prepare_paper_matching_fields(paper)
        source_hash = str(paper.get("source_hash") or "")
        title = str(paper.get("paper_title") or "")
        source_path = str(paper.get("source_path") or "")
        match_status = "only_in_B"
        match_type = None
        match_score = 0.0
        other_hash = None
        other_title = None
        other_path = None
        if idx_b in matched_b:
            match_status = "matched"
            info = matched_b_info.get(idx_b)
            if info:
                idx_a, match_type, match_score = info
                a_paper = dataset_a.papers[idx_a]
                other_hash = str(a_paper.get("source_hash") or "")
                other_title = str(a_paper.get("paper_title") or "")
                other_path = str(a_paper.get("source_path") or "")
        results.append(
            CompareResult(
                side="B",
                source_hash=source_hash,
                title=title,
                match_status=match_status,
                match_type=match_type,
                match_score=match_score,
                source_path=source_path if source_path else None,
                other_source_hash=other_hash,
                other_title=other_title,
                other_source_path=other_path,
                lang=lang.lower() if lang else None,
            )
        )

    # Pass 3: one "MATCH" row per matched pair.
    for idx_a, idx_b, match_type, match_score in match_pairs:
        paper_a = dataset_a.papers[idx_a]
        paper_b = dataset_b.papers[idx_b]
        results.append(
            CompareResult(
                side="MATCH",
                source_hash=str(paper_a.get("source_hash") or ""),
                title=str(paper_a.get("paper_title") or ""),
                match_status="matched_pair",
                match_type=match_type,
                match_score=match_score,
                source_path=str(paper_a.get("source_path") or "") or None,
                other_source_hash=str(paper_b.get("source_hash") or ""),
                other_title=str(paper_b.get("paper_title") or ""),
                other_source_path=str(paper_b.get("source_path") or "") or None,
                lang=lang.lower() if lang else None,
            )
        )

    return results


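# Illustrative sketch (not in the original source): tallying a comparison run
# from the per-side rows emitted above.
def _demo_summarize(results: list[CompareResult]) -> dict[str, int]:
    summary = {"matched_pair": 0, "only_in_A": 0, "only_in_B": 0}
    for r in results:
        if r.match_status in summary:
            summary[r.match_status] += 1
    return summary

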
def build_compare_dataset(
    *,
    json_paths: list[Path] | None = None,
    pdf_roots: list[Path] | None = None,
    md_roots: list[Path] | None = None,
    md_translated_roots: list[Path] | None = None,
    bibtex_path: Path | None = None,
    lang: str | None = None,
) -> CompareDataset:
    """Load and index a dataset from various sources."""
    papers: list[dict[str, Any]] = []

    # Load from JSON files
    if json_paths:
        for path in json_paths:
            data = json.loads(path.read_text(encoding="utf-8"))
            if isinstance(data, list):
                # Array format - direct list of papers
                papers.extend(data)
            elif isinstance(data, dict):
                # Object format with template_tag and papers
                if isinstance(data.get("papers"), list):
                    papers.extend(data["papers"])
            else:
                raise ValueError(f"Invalid JSON format in {path}")

    # Enrich with bibtex if provided
    if bibtex_path and PYBTEX_AVAILABLE:
        enrich_with_bibtex(papers, bibtex_path)

    for paper in papers:
        _prepare_paper_matching_fields(paper)

    md_paths = _scan_md_roots(md_roots or [])
    pdf_paths, _ = _scan_pdf_roots(pdf_roots or [])
    md_index = _build_file_index(md_roots or [], suffixes={".md"})
    pdf_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
    translated_index = _build_translated_index(md_translated_roots or [])

    if pdf_paths:
        papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
    if md_paths:
        papers.extend(_build_md_only_entries(papers, md_paths, md_index))
    if translated_index and lang:
        papers.extend(_build_translated_only_entries(papers, translated_index, lang))

    for paper in papers:
        _prepare_paper_matching_fields(paper)

    paper_index = _build_paper_index(papers)
    path_to_index: dict[Path, int] = {}
    hash_to_index: dict[str, int] = {}
    paper_id_to_index: dict[int, int] = {}
    for idx, paper in enumerate(papers):
        paper_id_to_index[id(paper)] = idx
        source_hash = str(paper.get("source_hash") or "")
        if source_hash and source_hash not in hash_to_index:
            hash_to_index[source_hash] = idx
        source_path = paper.get("source_path")
        if source_path:
            path_to_index[Path(str(source_path)).resolve()] = idx

    return CompareDataset(
        papers=papers,
        md_index=md_index,
        pdf_index=pdf_index,
        translated_index=translated_index,
        paper_index=paper_index,
        path_to_index=path_to_index,
        hash_to_index=hash_to_index,
        paper_id_to_index=paper_id_to_index,
    )


def compare_datasets(
    *,
    json_paths_a: list[Path] | None = None,
    pdf_roots_a: list[Path] | None = None,
    md_roots_a: list[Path] | None = None,
    md_translated_roots_a: list[Path] | None = None,
    json_paths_b: list[Path] | None = None,
    pdf_roots_b: list[Path] | None = None,
    md_roots_b: list[Path] | None = None,
    md_translated_roots_b: list[Path] | None = None,
    bibtex_path: Path | None = None,
    lang: str | None = None,
) -> list[CompareResult]:
    """Compare two datasets and return comparison results."""
    # Validate language requirement for translated inputs
    has_translated_a = md_translated_roots_a is not None and len(md_translated_roots_a) > 0
    has_translated_b = md_translated_roots_b is not None and len(md_translated_roots_b) > 0

    if (has_translated_a or has_translated_b) and lang is None:
        raise ValueError(
            "--lang parameter is required when comparing translated Markdown datasets"
        )

    dataset_a = build_compare_dataset(
        json_paths=json_paths_a,
        pdf_roots=pdf_roots_a,
        md_roots=md_roots_a,
        md_translated_roots=md_translated_roots_a,
        bibtex_path=bibtex_path,
        lang=lang,
    )

    dataset_b = build_compare_dataset(
        json_paths=json_paths_b,
        pdf_roots=pdf_roots_b,
        md_roots=md_roots_b,
        md_translated_roots=md_translated_roots_b,
        bibtex_path=bibtex_path,
        lang=lang,
    )

    return _match_datasets(dataset_a, dataset_b, lang=lang)
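

# Illustrative sketch (not in the original source): comparing a JSON database
# against a directory of PDFs; both paths are hypothetical.
def _demo_compare_datasets() -> None:
    for r in compare_datasets(json_paths_a=[Path("db.json")], pdf_roots_b=[Path("pdfs")]):
        if r.match_status == "matched_pair":
            print(f"{r.title!r} <-> {r.other_title!r} via {r.match_type} ({r.match_score:.2f})")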