deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +193 -0
  4. deepresearch_flow/paper/db_ops.py +1939 -0
  5. deepresearch_flow/paper/llm.py +2 -0
  6. deepresearch_flow/paper/web/app.py +46 -3320
  7. deepresearch_flow/paper/web/constants.py +23 -0
  8. deepresearch_flow/paper/web/filters.py +255 -0
  9. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  10. deepresearch_flow/paper/web/handlers/api.py +217 -0
  11. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  12. deepresearch_flow/paper/web/markdown.py +549 -0
  13. deepresearch_flow/paper/web/static/css/main.css +857 -0
  14. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  15. deepresearch_flow/paper/web/static/js/index.js +266 -0
  16. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  17. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  18. deepresearch_flow/paper/web/templates/base.html +43 -0
  19. deepresearch_flow/paper/web/templates/detail.html +332 -0
  20. deepresearch_flow/paper/web/templates/index.html +114 -0
  21. deepresearch_flow/paper/web/templates/stats.html +29 -0
  22. deepresearch_flow/paper/web/templates.py +85 -0
  23. deepresearch_flow/paper/web/text.py +68 -0
  24. deepresearch_flow/recognize/cli.py +157 -3
  25. deepresearch_flow/recognize/organize.py +58 -0
  26. deepresearch_flow/translator/__init__.py +1 -0
  27. deepresearch_flow/translator/cli.py +451 -0
  28. deepresearch_flow/translator/config.py +19 -0
  29. deepresearch_flow/translator/engine.py +959 -0
  30. deepresearch_flow/translator/fixers.py +451 -0
  31. deepresearch_flow/translator/placeholder.py +62 -0
  32. deepresearch_flow/translator/prompts.py +116 -0
  33. deepresearch_flow/translator/protector.py +291 -0
  34. deepresearch_flow/translator/segment.py +180 -0
  35. deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
  36. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
  37. deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
  38. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
  39. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  40. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  41. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/db_ops.py
@@ -0,0 +1,1939 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import difflib
+ import json
+ from pathlib import Path
+ from typing import Any
+ import re
+ import unicodedata
+
+ from deepresearch_flow.paper.utils import stable_hash
+
+ try:
+     from pypdf import PdfReader
+     PYPDF_AVAILABLE = True
+ except Exception:
+     PYPDF_AVAILABLE = False
+
+ try:
+     from pybtex.database import parse_file
+     PYBTEX_AVAILABLE = True
+ except Exception:
+     PYBTEX_AVAILABLE = False
+
+ @dataclass(frozen=True)
+ class PaperIndex:
+     papers: list[dict[str, Any]]
+     id_by_hash: dict[str, int]
+     ordered_ids: list[int]
+     by_tag: dict[str, set[int]]
+     by_author: dict[str, set[int]]
+     by_year: dict[str, set[int]]
+     by_month: dict[str, set[int]]
+     by_venue: dict[str, set[int]]
+     stats: dict[str, Any]
+     md_path_by_hash: dict[str, Path]
+     translated_md_by_hash: dict[str, dict[str, Path]]
+     pdf_path_by_hash: dict[str, Path]
+     template_tags: list[str]
+
+
+ def _normalize_key(value: str) -> str:
+     return value.strip().lower()
+
+
+ def _parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
+     if not date_str:
+         return None, None
+     text = str(date_str).strip()
+     year = None
+     month = None
+
+     year_match = re.search(r"(19|20)\d{2}", text)
+     if year_match:
+         year = year_match.group(0)
+
+     numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", text)
+     if numeric_match:
+         m = int(numeric_match.group(2))
+         if 1 <= m <= 12:
+             month = f"{m:02d}"
+         return year, month
+
+     month_word = re.search(
+         r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
+         r"january|february|march|april|june|july|august|september|october|november|december)",
+         text.lower(),
+     )
+     if month_word:
+         lookup = {
+             "january": "01",
+             "february": "02",
+             "march": "03",
+             "april": "04",
+             "may": "05",
+             "june": "06",
+             "july": "07",
+             "august": "08",
+             "september": "09",
+             "october": "10",
+             "november": "11",
+             "december": "12",
+             "jan": "01",
+             "feb": "02",
+             "mar": "03",
+             "apr": "04",
+             "jun": "06",
+             "jul": "07",
+             "aug": "08",
+             "sep": "09",
+             "sept": "09",
+             "oct": "10",
+             "nov": "11",
+             "dec": "12",
+         }
+         month = lookup.get(month_word.group(0))
+
+     return year, month
+
+
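A quick sketch of what the parser above accepts; the inputs are illustrative, not from the package's tests:

    # _parse_year_month is the helper defined in the hunk above.
    assert _parse_year_month("2023-07") == ("2023", "07")
    assert _parse_year_month("July 2021") == ("2021", "07")
    assert _parse_year_month(None) == (None, None)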
+ def _normalize_month_token(value: str | None) -> str | None:
+     if not value:
+         return None
+     raw = str(value).strip().lower()
+     if not raw:
+         return None
+     if raw.isdigit():
+         num = int(raw)
+         if 1 <= num <= 12:
+             return f"{num:02d}"
+     lookup = {
+         "january": "01",
+         "february": "02",
+         "march": "03",
+         "april": "04",
+         "may": "05",
+         "june": "06",
+         "july": "07",
+         "august": "08",
+         "september": "09",
+         "october": "10",
+         "november": "11",
+         "december": "12",
+         "jan": "01",
+         "feb": "02",
+         "mar": "03",
+         "apr": "04",
+         "jun": "06",
+         "jul": "07",
+         "aug": "08",
+         "sep": "09",
+         "sept": "09",
+         "oct": "10",
+         "nov": "11",
+         "dec": "12",
+     }
+     return lookup.get(raw)
+
+
+ def _extract_authors(paper: dict[str, Any]) -> list[str]:
+     value = paper.get("paper_authors")
+     if value is None:
+         return []
+     if isinstance(value, list):
+         return [str(item).strip() for item in value if str(item).strip()]
+     if isinstance(value, str):
+         return [part.strip() for part in value.split(",") if part.strip()]
+     return [str(value)]
+
+
+ def _extract_tags(paper: dict[str, Any]) -> list[str]:
+     tags = paper.get("ai_generated_tags") or []
+     if isinstance(tags, list):
+         return [str(tag).strip() for tag in tags if str(tag).strip()]
+     return []
+
+
+ def _extract_keywords(paper: dict[str, Any]) -> list[str]:
+     keywords = paper.get("keywords") or []
+     if isinstance(keywords, list):
+         return [str(keyword).strip() for keyword in keywords if str(keyword).strip()]
+     if isinstance(keywords, str):
+         parts = re.split(r"[;,]", keywords)
+         return [part.strip() for part in parts if part.strip()]
+     return []
+
+
+ _SUMMARY_FIELDS = (
+     "summary",
+     "abstract",
+     "keywords",
+     "question1",
+     "question2",
+     "question3",
+     "question4",
+     "question5",
+     "question6",
+     "question7",
+     "question8",
+ )
+
+
+ def _has_summary(paper: dict[str, Any], template_tags: list[str]) -> bool:
+     if template_tags:
+         return True
+     for key in _SUMMARY_FIELDS:
+         value = paper.get(key)
+         if isinstance(value, str) and value.strip():
+             return True
+     return False
+
+
+ def _extract_venue(paper: dict[str, Any]) -> str:
+     if isinstance(paper.get("bibtex"), dict):
+         bib = paper.get("bibtex") or {}
+         fields = bib.get("fields") or {}
+         bib_type = (bib.get("type") or "").lower()
+         if bib_type == "article" and fields.get("journal"):
+             return str(fields.get("journal"))
+         if bib_type in {"inproceedings", "conference", "proceedings"} and fields.get("booktitle"):
+             return str(fields.get("booktitle"))
+     return str(paper.get("publication_venue") or "")
+
+
+ def _available_templates(paper: dict[str, Any]) -> list[str]:
+     templates = paper.get("templates")
+     if not isinstance(templates, dict):
+         return []
+     order = paper.get("template_order") or list(templates.keys())
+     seen: set[str] = set()
+     available: list[str] = []
+     for tag in order:
+         if tag in templates and tag not in seen:
+             available.append(tag)
+             seen.add(tag)
+     for tag in templates:
+         if tag not in seen:
+             available.append(tag)
+             seen.add(tag)
+     return available
+
+
+ _TITLE_PREFIX_LEN = 16
+ _TITLE_MIN_CHARS = 24
+ _TITLE_MIN_TOKENS = 4
+ _AUTHOR_YEAR_MIN_SIMILARITY = 0.8
+ _LEADING_NUMERIC_MAX_LEN = 2
+ _SIMILARITY_START = 0.95
+ _SIMILARITY_STEP = 0.05
+ _SIMILARITY_MAX_STEPS = 10
+
+
+ def _normalize_title_key(title: str) -> str:
+     value = unicodedata.normalize("NFKD", title)
+     greek_map = {
+         "α": "alpha",
+         "β": "beta",
+         "γ": "gamma",
+         "δ": "delta",
+         "ε": "epsilon",
+         "ζ": "zeta",
+         "η": "eta",
+         "θ": "theta",
+         "ι": "iota",
+         "κ": "kappa",
+         "λ": "lambda",
+         "μ": "mu",
+         "ν": "nu",
+         "ξ": "xi",
+         "ο": "omicron",
+         "π": "pi",
+         "ρ": "rho",
+         "σ": "sigma",
+         "τ": "tau",
+         "υ": "upsilon",
+         "φ": "phi",
+         "χ": "chi",
+         "ψ": "psi",
+         "ω": "omega",
+     }
+     for char, name in greek_map.items():
+         value = value.replace(char, f" {name} ")
+     value = re.sub(
+         r"\\(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\b",
+         r" \1 ",
+         value,
+         flags=re.IGNORECASE,
+     )
+     value = value.replace("{", "").replace("}", "")
+     value = value.replace("_", " ")
+     value = re.sub(r"([a-z])([0-9])", r"\1 \2", value, flags=re.IGNORECASE)
+     value = re.sub(r"([0-9])([a-z])", r"\1 \2", value, flags=re.IGNORECASE)
+     value = re.sub(r"[^a-z0-9]+", " ", value.lower())
+     value = re.sub(r"\s+", " ", value).strip()
+     tokens = value.split()
+     if not tokens:
+         return ""
+     merged: list[str] = []
+     idx = 0
+     while idx < len(tokens):
+         token = tokens[idx]
+         if len(token) == 1 and idx + 1 < len(tokens):
+             merged.append(token + tokens[idx + 1])
+             idx += 2
+             continue
+         merged.append(token)
+         idx += 1
+     return " ".join(merged)
+
+
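Beyond transliterating Greek letters and LaTeX macros, the normalizer above merges stray single-character tokens into their successor, which helps with titles whose letters were spaced out by PDF extraction. Two illustrative (hypothetical) titles:

    # Braces and underscores are dropped, text is lowercased, and the
    # lone token "a" is merged into the following word.
    assert _normalize_title_key("A {Survey} of Deep_Learning") == "asurvey of deep learning"
    assert _normalize_title_key("β-VAE") == "beta vae"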
+ def _compact_title_key(title_key: str) -> str:
+     return title_key.replace(" ", "")
+
+
+ def _strip_leading_numeric_tokens(title_key: str) -> str:
+     tokens = title_key.split()
+     idx = 0
+     while idx < len(tokens):
+         token = tokens[idx]
+         if token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN:
+             idx += 1
+             continue
+         break
+     if idx == 0:
+         return title_key
+     return " ".join(tokens[idx:])
+
+
+ def _strip_pdf_hash_suffix(name: str) -> str:
+     return re.sub(r"(?i)(\.pdf)(?:-[0-9a-f\-]{8,})$", r"\1", name)
+
+
+ def _extract_title_from_filename(name: str) -> str:
+     base = name
+     lower = base.lower()
+     if lower.endswith(".md"):
+         base = base[:-3]
+         lower = base.lower()
+     if ".pdf-" in lower:
+         base = _strip_pdf_hash_suffix(base)
+         lower = base.lower()
+     if lower.endswith(".pdf"):
+         base = base[:-4]
+     base = base.replace("_", " ").strip()
+     match = re.match(r"\s*\d{4}\s*-\s*(.+)$", base)
+     if match:
+         return match.group(1).strip()
+     match = re.match(r"\s*.+?\s*-\s*\d{4}\s*-\s*(.+)$", base)
+     if match:
+         return match.group(1).strip()
+     return base.strip()
+
+
+ def _clean_pdf_metadata_title(value: str | None, path: Path) -> str | None:
+     if not value:
+         return None
+     text = str(value).replace("\x00", "").strip()
+     if not text:
+         return None
+ text = re.sub(r"(?i)^microsoft\\s+word\\s*-\\s*", "", text)
341
+ text = re.sub(r"(?i)^pdf\\s*-\\s*", "", text)
342
+ text = re.sub(r"(?i)^untitled\\b", "", text).strip()
+     if text.lower().endswith(".pdf"):
+         text = text[:-4].strip()
+     if len(text) < 3:
+         return None
+     stem = path.stem.strip()
+     if stem and text.lower() == stem.lower():
+         return None
+     return text
+
+
+ def _read_pdf_metadata_title(path: Path) -> str | None:
+     if not PYPDF_AVAILABLE:
+         return None
+     try:
+         reader = PdfReader(str(path))
+         meta = reader.metadata
+         title = meta.title if meta else None
+     except Exception:
+         return None
+     return _clean_pdf_metadata_title(title, path)
+
+
+ def _is_pdf_like(path: Path) -> bool:
+     suffix = path.suffix.lower()
+     if suffix == ".pdf":
+         return True
+     name_lower = path.name.lower()
+     return ".pdf-" in name_lower and not name_lower.endswith(".md")
+
+
+ def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
+     pdf_paths: list[Path] = []
+     meta: list[dict[str, Any]] = []
+     seen: set[Path] = set()
+     for root in roots:
+         try:
+             if not root.exists() or not root.is_dir():
+                 continue
+         except OSError:
+             continue
+         files: list[Path] = []
+         for path in root.rglob("*"):
+             try:
+                 if not path.is_file():
+                     continue
+             except OSError:
+                 continue
+             if not _is_pdf_like(path):
+                 continue
+             resolved = path.resolve()
+             if resolved in seen:
+                 continue
+             seen.add(resolved)
+             files.append(resolved)
+         max_mtime = 0.0
+         total_size = 0
+         for path in files:
+             try:
+                 stats = path.stat()
+             except OSError:
+                 continue
+             max_mtime = max(max_mtime, stats.st_mtime)
+             total_size += stats.st_size
+         pdf_paths.extend(files)
+         meta.append(
+             {
+                 "path": str(root),
+                 "count": len(files),
+                 "max_mtime": max_mtime,
+                 "size": total_size,
+             }
+         )
+     return pdf_paths, meta
+
+
+ def _extract_year_author_from_filename(name: str) -> tuple[str | None, str | None]:
+     base = name
+     lower = base.lower()
+     if lower.endswith(".md"):
+         base = base[:-3]
+         lower = base.lower()
+     if ".pdf-" in lower:
+         base = _strip_pdf_hash_suffix(base)
+         lower = base.lower()
+     if lower.endswith(".pdf"):
+         base = base[:-4]
+     match = re.match(r"\s*(.+?)\s*-\s*((?:19|20)\d{2})\s*-\s*", base)
+     if match:
+         return match.group(2), match.group(1).strip()
+     match = re.match(r"\s*((?:19|20)\d{2})\s*-\s*", base)
+     if match:
+         return match.group(1), None
+     return None, None
+
+
+ def _normalize_author_key(name: str) -> str:
+     raw = name.lower().strip()
+     raw = raw.replace("et al.", "").replace("et al", "")
+     if "," in raw:
+         raw = raw.split(",", 1)[0]
+     raw = re.sub(r"[^a-z0-9]+", " ", raw)
+     raw = re.sub(r"\s+", " ", raw).strip()
+     if not raw:
+         return ""
+     parts = raw.split()
+     return parts[-1] if parts else raw
+
+
+ def _title_prefix_key(title_key: str) -> str | None:
+     if len(title_key.split()) < _TITLE_MIN_TOKENS:
+         return None
+     compact = _compact_title_key(title_key)
+     if len(compact) < _TITLE_PREFIX_LEN:
+         return None
+     prefix = compact[:_TITLE_PREFIX_LEN]
+     if not prefix:
+         return None
+     return f"prefix:{prefix}"
+
+
+ def _title_overlap_match(a: str, b: str) -> bool:
+     if not a or not b:
+         return False
+     if a == b:
+         return True
+     shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
+     token_count = len(shorter.split())
+     if len(shorter) >= _TITLE_MIN_CHARS or token_count >= _TITLE_MIN_TOKENS:
+         if longer.startswith(shorter) or shorter in longer:
+             return True
+     return False
+
+
+ def _title_similarity(a: str, b: str) -> float:
+     if not a or not b:
+         return 0.0
+     return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()
+
+
+ def _adaptive_similarity_match(title_key: str, candidates: list[Path]) -> tuple[Path | None, float]:
+     if not title_key:
+         return None, 0.0
+     scored: list[tuple[Path, float]] = []
+     for path in candidates:
+         candidate_title = _normalize_title_key(_extract_title_from_filename(path.name))
+         if not candidate_title:
+             continue
+         if _title_overlap_match(title_key, candidate_title):
+             return path, 1.0
+         scored.append((path, _title_similarity(title_key, candidate_title)))
+     if not scored:
+         return None, 0.0
+
+     def matches_at(threshold: float) -> list[tuple[Path, float]]:
+         return [(path, score) for path, score in scored if score >= threshold]
+
+     threshold = _SIMILARITY_START
+     step = _SIMILARITY_STEP
+     prev_threshold = None
+     prev_count = None
+     for _ in range(_SIMILARITY_MAX_STEPS):
+         matches = matches_at(threshold)
+         if len(matches) == 1:
+             path, score = matches[0]
+             return path, score
+         if len(matches) == 0:
+             prev_threshold = threshold
+             prev_count = 0
+             threshold -= step
+             continue
+         if prev_count == 0 and prev_threshold is not None:
+             low = threshold
+             high = prev_threshold
+             for _ in range(_SIMILARITY_MAX_STEPS):
+                 mid = (low + high) / 2
+                 mid_matches = matches_at(mid)
+                 if len(mid_matches) == 1:
+                     path, score = mid_matches[0]
+                     return path, score
+                 if len(mid_matches) == 0:
+                     high = mid
+                 else:
+                     low = mid
+             return None, 0.0
+         prev_threshold = threshold
+         prev_count = len(matches)
+         threshold -= step
+     return None, 0.0
+
+
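The matcher above returns immediately on an exact or containment hit, then lowers the similarity threshold from 0.95 in 0.05 steps until exactly one candidate clears it, bisecting between the last empty and first crowded thresholds to isolate a unique winner. A minimal usage sketch with hypothetical filenames:

    from pathlib import Path

    candidates = [
        Path("2017 - Attention Is All You Need.pdf"),
        Path("2020 - Language Models are Few-Shot Learners.pdf"),
    ]
    # Exact overlap with the first filename's normalized title -> (path, 1.0).
    match, score = _adaptive_similarity_match("attention is all you need", candidates)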
+ def _resolve_by_title_and_meta(
+     paper: dict[str, Any],
+     file_index: dict[str, list[Path]],
+ ) -> tuple[Path | None, str | None, float]:
+     title = str(paper.get("paper_title") or "")
+     title_key = _normalize_title_key(title)
+     if not title_key:
+         title_key = ""
+     candidates = file_index.get(title_key, [])
+     if candidates:
+         return candidates[0], "title", 1.0
+     if title_key:
+         compact_key = _compact_title_key(title_key)
+         compact_candidates = file_index.get(f"compact:{compact_key}", [])
+         if compact_candidates:
+             return compact_candidates[0], "title_compact", 1.0
+         stripped_key = _strip_leading_numeric_tokens(title_key)
+         if stripped_key and stripped_key != title_key:
+             stripped_candidates = file_index.get(stripped_key, [])
+             if stripped_candidates:
+                 return stripped_candidates[0], "title_stripped", 1.0
+             stripped_compact = _compact_title_key(stripped_key)
+             stripped_candidates = file_index.get(f"compact:{stripped_compact}", [])
+             if stripped_candidates:
+                 return stripped_candidates[0], "title_compact", 1.0
+     prefix_candidates: list[Path] = []
+     prefix_key = _title_prefix_key(title_key)
+     if prefix_key:
+         prefix_candidates = file_index.get(prefix_key, [])
+     if not prefix_candidates:
+         stripped_key = _strip_leading_numeric_tokens(title_key)
+         if stripped_key and stripped_key != title_key:
+             prefix_key = _title_prefix_key(stripped_key)
+             if prefix_key:
+                 prefix_candidates = file_index.get(prefix_key, [])
+     if prefix_candidates:
+         match, score = _adaptive_similarity_match(title_key, prefix_candidates)
+         if match is not None:
+             match_type = "title_prefix" if score >= 1.0 else "title_fuzzy"
+             return match, match_type, score
+     year = str(paper.get("_year") or "").strip()
+     if not year.isdigit():
+         return None, None, 0.0
+     author_key = ""
+     authors = paper.get("_authors") or []
+     if authors:
+         author_key = _normalize_author_key(str(authors[0]))
+     candidates = []
+     match_type = "year"
+     if author_key:
+         candidates = file_index.get(f"authoryear:{year}:{author_key}", [])
+         if candidates:
+             match_type = "author_year"
+     if not candidates:
+         candidates = file_index.get(f"year:{year}", [])
+     if not candidates:
+         return None, None, 0.0
+     if len(candidates) == 1 and not title_key:
+         return candidates[0], match_type, 1.0
+     match, score = _adaptive_similarity_match(title_key, candidates)
+     if match is not None:
+         if score < _AUTHOR_YEAR_MIN_SIMILARITY:
+             return None, None, 0.0
+         return match, "title_fuzzy", score
+     return None, None, 0.0
+
+
+ def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
+     index: dict[str, list[Path]] = {}
+     for root in roots:
+         try:
+             if not root.exists() or not root.is_dir():
+                 continue
+         except OSError:
+             continue
+         for path in root.rglob("*"):
+             try:
+                 if not path.is_file():
+                     continue
+             except OSError:
+                 continue
+             suffix = path.suffix.lower()
+             if suffix not in suffixes:
+                 name_lower = path.name.lower()
+                 if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
+                     pass
+                 else:
+                     continue
+             resolved = path.resolve()
+             name_key = path.name.lower()
+             index.setdefault(name_key, []).append(resolved)
+             title_candidate = _extract_title_from_filename(path.name)
+             title_key = _normalize_title_key(title_candidate)
+             if title_key:
+                 if title_key != name_key:
+                     index.setdefault(title_key, []).append(resolved)
+                 compact_key = _compact_title_key(title_key)
+                 if compact_key:
+                     index.setdefault(f"compact:{compact_key}", []).append(resolved)
+                 prefix_key = _title_prefix_key(title_key)
+                 if prefix_key:
+                     index.setdefault(prefix_key, []).append(resolved)
+                 stripped_key = _strip_leading_numeric_tokens(title_key)
+                 if stripped_key and stripped_key != title_key:
+                     index.setdefault(stripped_key, []).append(resolved)
+                     stripped_compact = _compact_title_key(stripped_key)
+                     if stripped_compact:
+                         index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
+                     stripped_prefix = _title_prefix_key(stripped_key)
+                     if stripped_prefix:
+                         index.setdefault(stripped_prefix, []).append(resolved)
+             year_hint, author_hint = _extract_year_author_from_filename(path.name)
+             if year_hint:
+                 index.setdefault(f"year:{year_hint}", []).append(resolved)
+                 if author_hint:
+                     author_key = _normalize_author_key(author_hint)
+                     if author_key:
+                         index.setdefault(f"authoryear:{year_hint}:{author_key}", []).append(resolved)
+     return index
+
+
+ def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
+     index: dict[str, list[Path]] = {}
+     for path in paths:
+         try:
+             if not path.is_file():
+                 continue
+         except OSError:
+             continue
+         suffix = path.suffix.lower()
+         if suffix not in suffixes:
+             name_lower = path.name.lower()
+             if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
+                 pass
+             else:
+                 continue
+         resolved = path.resolve()
+         name_key = path.name.lower()
+         index.setdefault(name_key, []).append(resolved)
+         title_candidate = _extract_title_from_filename(path.name)
+         title_key = _normalize_title_key(title_candidate)
+         if title_key:
+             if title_key != name_key:
+                 index.setdefault(title_key, []).append(resolved)
+             compact_key = _compact_title_key(title_key)
+             if compact_key:
+                 index.setdefault(f"compact:{compact_key}", []).append(resolved)
+             prefix_key = _title_prefix_key(title_key)
+             if prefix_key:
+                 index.setdefault(prefix_key, []).append(resolved)
+             stripped_key = _strip_leading_numeric_tokens(title_key)
+             if stripped_key and stripped_key != title_key:
+                 index.setdefault(stripped_key, []).append(resolved)
+                 stripped_compact = _compact_title_key(stripped_key)
+                 if stripped_compact:
+                     index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
+                 stripped_prefix = _title_prefix_key(stripped_key)
+                 if stripped_prefix:
+                     index.setdefault(stripped_prefix, []).append(resolved)
+     return index
+
+
+ def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
+     index: dict[str, dict[str, Path]] = {}
+     candidates: list[Path] = []
+     for root in roots:
+         try:
+             if not root.exists() or not root.is_dir():
+                 continue
+         except OSError:
+             continue
+         try:
+             candidates.extend(root.rglob("*.md"))
+         except OSError:
+             continue
+     for path in sorted(candidates, key=lambda item: str(item)):
+         try:
+             if not path.is_file():
+                 continue
+         except OSError:
+             continue
+         name = path.name
+         match = re.match(r"^(.+)\.([^.]+)\.md$", name, flags=re.IGNORECASE)
+         if not match:
+             continue
+         base_name = match.group(1).strip()
+         lang = match.group(2).strip()
+         if not base_name or not lang:
+             continue
+         base_key = base_name.lower()
+         lang_key = lang.lower()
+         index.setdefault(base_key, {}).setdefault(lang_key, path.resolve())
+     return index
+
+
+ def _resolve_source_md(paper: dict[str, Any], md_index: dict[str, list[Path]]) -> Path | None:
+     source_path = paper.get("source_path")
+     if not source_path:
+         source_path = ""
+     if source_path:
+         name = Path(str(source_path)).name.lower()
+         candidates = md_index.get(name, [])
+         if candidates:
+             return candidates[0]
+     match, _, _ = _resolve_by_title_and_meta(paper, md_index)
+     return match
+
+
+ def _guess_pdf_names(paper: dict[str, Any]) -> list[str]:
+     source_path = paper.get("source_path")
+     if not source_path:
+         return []
+     name = Path(str(source_path)).name
+     match = re.match(r"(?i)(.+\.pdf)(?:-[0-9a-f\-]{8,})?\.md$", name)
+     if match:
+         return [Path(match.group(1)).name]
+     if ".pdf-" in name.lower():
+         base = name[: name.lower().rfind(".pdf-") + 4]
+         return [Path(base).name]
+     if name.lower().endswith(".pdf"):
+         return [name]
+     if name.lower().endswith(".pdf.md"):
+         return [name[:-3]]
+     return []
+
+
+ def _resolve_pdf(paper: dict[str, Any], pdf_index: dict[str, list[Path]]) -> Path | None:
+     for filename in _guess_pdf_names(paper):
+         candidates = pdf_index.get(filename.lower(), [])
+         if candidates:
+             return candidates[0]
+     match, _, _ = _resolve_by_title_and_meta(paper, pdf_index)
+     return match
+
+
+ def build_index(
+     papers: list[dict[str, Any]],
+     *,
+     md_roots: list[Path] | None = None,
+     md_translated_roots: list[Path] | None = None,
+     pdf_roots: list[Path] | None = None,
+ ) -> PaperIndex:
+     id_by_hash: dict[str, int] = {}
+     by_tag: dict[str, set[int]] = {}
+     by_author: dict[str, set[int]] = {}
+     by_year: dict[str, set[int]] = {}
+     by_month: dict[str, set[int]] = {}
+     by_venue: dict[str, set[int]] = {}
+
+     md_path_by_hash: dict[str, Path] = {}
+     translated_md_by_hash: dict[str, dict[str, Path]] = {}
+     pdf_path_by_hash: dict[str, Path] = {}
+
+     md_file_index = _build_file_index(md_roots or [], suffixes={".md"})
+     translated_index = _build_translated_index(md_translated_roots or [])
+     pdf_file_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
+
+     year_counts: dict[str, int] = {}
+     month_counts: dict[str, int] = {}
+     tag_counts: dict[str, int] = {}
+     keyword_counts: dict[str, int] = {}
+     author_counts: dict[str, int] = {}
+     venue_counts: dict[str, int] = {}
+     template_tag_counts: dict[str, int] = {}
+
+     def add_index(index: dict[str, set[int]], key: str, idx: int) -> None:
+         index.setdefault(key, set()).add(idx)
+
+     for idx, paper in enumerate(papers):
+         is_pdf_only = bool(paper.get("_is_pdf_only"))
+         source_hash = paper.get("source_hash")
+         if not source_hash and paper.get("source_path"):
+             source_hash = stable_hash(str(paper.get("source_path")))
+         if source_hash:
+             id_by_hash[str(source_hash)] = idx
+
+         title = str(paper.get("paper_title") or "")
+         paper["_title_lc"] = title.lower()
+
+         bib_fields: dict[str, Any] = {}
+         if isinstance(paper.get("bibtex"), dict):
+             bib_fields = paper.get("bibtex", {}).get("fields", {}) or {}
+
+         year = None
+         if bib_fields.get("year") and str(bib_fields.get("year")).isdigit():
+             year = str(bib_fields.get("year"))
+         month = _normalize_month_token(bib_fields.get("month"))
+         if not year or not month:
+             parsed_year, parsed_month = _parse_year_month(str(paper.get("publication_date") or ""))
+             year = year or parsed_year
+             month = month or parsed_month
+
+         year_label = year or "Unknown"
+         month_label = month or "Unknown"
+         paper["_year"] = year_label
+         paper["_month"] = month_label
+         add_index(by_year, _normalize_key(year_label), idx)
+         add_index(by_month, _normalize_key(month_label), idx)
+         if not is_pdf_only:
+             year_counts[year_label] = year_counts.get(year_label, 0) + 1
+             month_counts[month_label] = month_counts.get(month_label, 0) + 1
+
+         venue = _extract_venue(paper).strip()
+         paper["_venue"] = venue
+         if venue:
+             add_index(by_venue, _normalize_key(venue), idx)
+             if not is_pdf_only:
+                 venue_counts[venue] = venue_counts.get(venue, 0) + 1
+         else:
+             add_index(by_venue, "unknown", idx)
+             if not is_pdf_only:
+                 venue_counts["Unknown"] = venue_counts.get("Unknown", 0) + 1
+
+         authors = _extract_authors(paper)
+         paper["_authors"] = authors
+         for author in authors:
+             key = _normalize_key(author)
+             add_index(by_author, key, idx)
+             if not is_pdf_only:
+                 author_counts[author] = author_counts.get(author, 0) + 1
+
+         tags = _extract_tags(paper)
+         paper["_tags"] = tags
+         for tag in tags:
+             key = _normalize_key(tag)
+             add_index(by_tag, key, idx)
+             if not is_pdf_only:
+                 tag_counts[tag] = tag_counts.get(tag, 0) + 1
+
+         keywords = _extract_keywords(paper)
+         paper["_keywords"] = keywords
+         for keyword in keywords:
+             if not is_pdf_only:
+                 keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
+
+         template_tags = _available_templates(paper)
+         if not template_tags:
+             fallback_tag = paper.get("template_tag") or paper.get("prompt_template")
+             if fallback_tag:
+                 template_tags = [str(fallback_tag)]
+         paper["_template_tags"] = template_tags
+         paper["_template_tags_lc"] = [tag.lower() for tag in template_tags]
+         paper["_has_summary"] = _has_summary(paper, template_tags)
+         if not is_pdf_only:
+             for tag in template_tags:
+                 template_tag_counts[tag] = template_tag_counts.get(tag, 0) + 1
+
+         search_parts = [title, venue, " ".join(authors), " ".join(tags)]
+         paper["_search_lc"] = " ".join(part for part in search_parts if part).lower()
+
+         source_hash_str = str(source_hash) if source_hash else str(idx)
+         md_path = _resolve_source_md(paper, md_file_index)
+         if md_path is not None:
+             md_path_by_hash[source_hash_str] = md_path
+             base_key = md_path.with_suffix("").name.lower()
+             translations = translated_index.get(base_key, {})
+             if translations:
+                 translated_md_by_hash[source_hash_str] = translations
+         pdf_path = _resolve_pdf(paper, pdf_file_index)
+         if pdf_path is not None:
+             pdf_path_by_hash[source_hash_str] = pdf_path
+
+     def year_sort_key(item: tuple[int, dict[str, Any]]) -> tuple[int, int, str]:
+         idx, paper = item
+         year_label = str(paper.get("_year") or "Unknown")
+         title_label = str(paper.get("paper_title") or "")
+         if year_label.isdigit():
+             return (0, -int(year_label), title_label.lower())
+         return (1, 0, title_label.lower())
+
+     ordered_ids = [idx for idx, _ in sorted(enumerate(papers), key=year_sort_key)]
+
+     stats_total = sum(1 for paper in papers if not paper.get("_is_pdf_only"))
+     stats = {
+         "total": stats_total,
+         "years": _sorted_counts(year_counts, numeric_desc=True),
+         "months": _sorted_month_counts(month_counts),
+         "tags": _sorted_counts(tag_counts),
+         "keywords": _sorted_counts(keyword_counts),
+         "authors": _sorted_counts(author_counts),
+         "venues": _sorted_counts(venue_counts),
+     }
+
+     template_tags = sorted(template_tag_counts.keys(), key=lambda item: item.lower())
+
+     return PaperIndex(
+         papers=papers,
+         id_by_hash=id_by_hash,
+         ordered_ids=ordered_ids,
+         by_tag=by_tag,
+         by_author=by_author,
+         by_year=by_year,
+         by_month=by_month,
+         by_venue=by_venue,
+         stats=stats,
+         md_path_by_hash=md_path_by_hash,
+         translated_md_by_hash=translated_md_by_hash,
+         pdf_path_by_hash=pdf_path_by_hash,
+         template_tags=template_tags,
+     )
+
+
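A minimal sketch of driving build_index with one in-memory record and no file roots; the field names follow the extractor output used elsewhere in this diff:

    papers = [{
        "paper_title": "An Example Paper",
        "paper_authors": ["A. Author"],
        "publication_date": "2024-01",
    }]
    index = build_index(papers)
    # index.papers[0]["_year"] == "2024" and index.stats["total"] == 1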
+ def _sorted_counts(counts: dict[str, int], *, numeric_desc: bool = False) -> list[dict[str, Any]]:
+     items = list(counts.items())
+     if numeric_desc:
+         def key(item: tuple[str, int]) -> tuple[int, int]:
+             label, count = item
+             if label.isdigit():
+                 return (0, -int(label))
+             return (1, 0)
+         items.sort(key=key)
+     else:
+         items.sort(key=lambda item: item[1], reverse=True)
+     return [{"label": k, "count": v} for k, v in items]
+
+
+ def _sorted_month_counts(counts: dict[str, int]) -> list[dict[str, Any]]:
+     def month_sort(label: str) -> int:
+         if label == "Unknown":
+             return 99
+         if label.isdigit():
+             return int(label)
+         return 98
+
+     items = sorted(counts.items(), key=lambda item: month_sort(item[0]))
+     return [{"label": k, "count": v} for k, v in items]
+
+
+ # ============================================================================
+ # Data Layer Helpers: Load, Merge, Cache, PDF-only Entries
+ # ============================================================================
+
+ _TEMPLATE_INFER_IGNORE_KEYS = {
+     "source_path",
+     "source_hash",
+     "provider",
+     "model",
+     "extracted_at",
+     "truncation",
+     "output_language",
+     "prompt_template",
+ }
+
+
+ def _load_paper_inputs(paths: list[Path]) -> list[dict[str, Any]]:
+     """Load paper JSON files and infer template tags if needed."""
+     # Delayed import to avoid circular dependency with template_registry
+     from deepresearch_flow.paper.template_registry import (
+         list_template_names_in_registry_order,
+         load_schema_for_template,
+     )
+
+     inputs: list[dict[str, Any]] = []
+     for path in paths:
+         payload = json.loads(path.read_text(encoding="utf-8"))
+         if isinstance(payload, list):
+             raise ValueError(
+                 f"Input JSON must be an object with template_tag and papers (got array): {path}"
+             )
+         if not isinstance(payload, dict):
+             raise ValueError(f"Input JSON must be an object: {path}")
+         papers = payload.get("papers")
+         if not isinstance(papers, list):
+             raise ValueError(f"Input JSON missing papers list: {path}")
+         template_tag = payload.get("template_tag")
+         if not template_tag:
+             template_tag = _infer_template_tag(papers, path, list_template_names_in_registry_order, load_schema_for_template)
+         inputs.append({"template_tag": str(template_tag), "papers": papers})
+     return inputs
+
+
+ def _infer_template_tag(
+     papers: list[dict[str, Any]],
+     path: Path,
+     list_template_names_in_registry_order,
+     load_schema_for_template,
+ ) -> str:
+     """Infer template tag from paper content."""
+     prompt_tags = {
+         str(paper.get("prompt_template"))
+         for paper in papers
+         if isinstance(paper, dict) and paper.get("prompt_template")
+     }
+     if len(prompt_tags) == 1:
+         return prompt_tags.pop()
+
+     sample = next((paper for paper in papers if isinstance(paper, dict)), None)
+     if sample is None:
+         raise ValueError(f"Input JSON has no paper objects to infer template_tag: {path}")
+
+     paper_keys = {key for key in sample.keys() if key not in _TEMPLATE_INFER_IGNORE_KEYS}
+     if not paper_keys:
+         raise ValueError(f"Input JSON papers have no keys to infer template_tag: {path}")
+
+     best_tag = None
+     best_score = -1
+     for name in list_template_names_in_registry_order():
+         schema = load_schema_for_template(name)
+         schema_keys = set((schema.get("properties") or {}).keys())
+         score = len(paper_keys & schema_keys)
+         if score > best_score:
+             best_score = score
+             best_tag = name
+         elif score == best_score:
+             if best_tag != "simple" and name == "simple":
+                 best_tag = name
+
+     if not best_tag:
+         raise ValueError(f"Unable to infer template_tag from input JSON: {path}")
+     return best_tag
+
+
+ def _build_cache_meta(
+     db_paths: list[Path],
+     bibtex_path: Path | None,
+     pdf_roots_meta: list[dict[str, Any]] | None = None,
+ ) -> dict[str, Any]:
+     """Build cache metadata for invalidation."""
+     def file_meta(path: Path) -> dict[str, Any]:
+         try:
+             stats = path.stat()
+         except OSError as exc:
+             raise ValueError(f"Failed to read input metadata for cache: {path}") from exc
+         return {"path": str(path), "mtime": stats.st_mtime, "size": stats.st_size}
+
+     meta = {
+         "version": 1,
+         "inputs": [file_meta(path) for path in db_paths],
+         "bibtex": file_meta(bibtex_path) if bibtex_path else None,
+     }
+     if pdf_roots_meta is not None:
+         meta["pdf_roots"] = pdf_roots_meta
+     return meta
+
+
+ def _load_cached_papers(cache_dir: Path, meta: dict[str, Any]) -> list[dict[str, Any]] | None:
+     """Load cached papers if metadata matches."""
+     meta_path = cache_dir / "db_serve_cache.meta.json"
+     data_path = cache_dir / "db_serve_cache.papers.json"
+     if not meta_path.exists() or not data_path.exists():
+         return None
+     try:
+         cached_meta = json.loads(meta_path.read_text(encoding="utf-8"))
+         if cached_meta != meta:
+             return None
+         cached_papers = json.loads(data_path.read_text(encoding="utf-8"))
+         if not isinstance(cached_papers, list):
+             return None
+         return cached_papers
+     except Exception:
+         return None
+
+
+ def _write_cached_papers(cache_dir: Path, meta: dict[str, Any], papers: list[dict[str, Any]]) -> None:
+     """Write cached papers and metadata."""
+     meta_path = cache_dir / "db_serve_cache.meta.json"
+     data_path = cache_dir / "db_serve_cache.papers.json"
+     meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
+     data_path.write_text(json.dumps(papers, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+ def _extract_year_for_matching(paper: dict[str, Any]) -> str | None:
+     """Extract year from bibtex or publication_date for matching."""
+     if isinstance(paper.get("bibtex"), dict):
+         fields = paper.get("bibtex", {}).get("fields", {}) or {}
+         year = fields.get("year")
+         if year and str(year).isdigit():
+             return str(year)
+     parsed_year, _ = _parse_year_month(str(paper.get("publication_date") or ""))
+     return parsed_year
+
+
+ def _prepare_paper_matching_fields(paper: dict[str, Any]) -> None:
+     """Ensure paper has _authors and _year fields for matching."""
+     if "_authors" not in paper:
+         paper["_authors"] = _extract_authors(paper)
+     if "_year" not in paper:
+         paper["_year"] = _extract_year_for_matching(paper) or ""
+
+
+ def _build_pdf_only_entries(
+     papers: list[dict[str, Any]],
+     pdf_paths: list[Path],
+     pdf_index: dict[str, list[Path]],
+ ) -> list[dict[str, Any]]:
+     """Build paper entries for unmatched PDFs."""
+     matched: set[Path] = set()
+     for paper in papers:
+         _prepare_paper_matching_fields(paper)
+         pdf_path = _resolve_pdf(paper, pdf_index)
+         if pdf_path:
+             matched.add(pdf_path.resolve())
+
+     entries: list[dict[str, Any]] = []
+     for path in pdf_paths:
+         resolved = path.resolve()
+         if resolved in matched:
+             continue
+         title = _read_pdf_metadata_title(resolved) or _extract_title_from_filename(resolved.name)
+         if not title:
+             title = resolved.stem
+         year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
+         entry: dict[str, Any] = {
+             "paper_title": title,
+             "paper_authors": [author_hint] if author_hint else [],
+             "publication_date": year_hint or "",
+             "source_hash": stable_hash(str(resolved)),
+             "source_path": str(resolved),
+             "_is_pdf_only": True,
+         }
+         entries.append(entry)
+     return entries
+
+
+ def _normalize_merge_title(value: str | None) -> str | None:
+     """Normalize title for merging."""
+     if not value:
+         return None
+     return str(value).replace("{", "").replace("}", "").strip().lower()
+
+
+ def _extract_bibtex_title(paper: dict[str, Any]) -> str | None:
+     """Extract normalized title from bibtex."""
+     if not isinstance(paper.get("bibtex"), dict):
+         return None
+     fields = paper.get("bibtex", {}).get("fields", {}) or {}
+     return _normalize_merge_title(fields.get("title"))
+
+
+ def _extract_paper_title(paper: dict[str, Any]) -> str | None:
+     """Extract normalized paper_title."""
+     return _normalize_merge_title(paper.get("paper_title"))
+
+
+ def _titles_match(group: dict[str, Any], paper: dict[str, Any], *, threshold: float) -> bool:
+     """Check if paper title matches group titles."""
+     bib_title = _extract_bibtex_title(paper)
+     group_bib = group.get("_merge_bibtex_titles") or set()
+     if bib_title and group_bib:
+         return any(_title_similarity(bib_title, existing) >= threshold for existing in group_bib)
+
+     paper_title = _extract_paper_title(paper)
+     group_titles = group.get("_merge_paper_titles") or set()
+     if paper_title and group_titles:
+         return any(_title_similarity(paper_title, existing) >= threshold for existing in group_titles)
+     return False
+
+
+ def _add_merge_titles(group: dict[str, Any], paper: dict[str, Any]) -> None:
+     """Add paper titles to group merge tracking."""
+     bib_title = _extract_bibtex_title(paper)
+     if bib_title:
+         group.setdefault("_merge_bibtex_titles", set()).add(bib_title)
+     paper_title = _extract_paper_title(paper)
+     if paper_title:
+         group.setdefault("_merge_paper_titles", set()).add(paper_title)
+
+
+ def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
+     """Merge paper inputs from multiple template extractions."""
+     merged: list[dict[str, Any]] = []
+     threshold = 0.95
+     prefix_len = 5
+     bibtex_exact: dict[str, set[int]] = {}
+     bibtex_prefix: dict[str, set[int]] = {}
+     paper_exact: dict[str, set[int]] = {}
+     paper_prefix: dict[str, set[int]] = {}
+
+     def prefix_key(value: str) -> str:
+         return value[:prefix_len] if len(value) >= prefix_len else value
+
+     def add_index(
+         value: str,
+         exact_index: dict[str, set[int]],
+         prefix_index: dict[str, set[int]],
+         idx: int,
+     ) -> None:
+         exact_index.setdefault(value, set()).add(idx)
+         prefix_index.setdefault(prefix_key(value), set()).add(idx)
+
+     def candidate_ids(bib_title: str | None, paper_title: str | None) -> list[int]:
+         ids: set[int] = set()
+         if bib_title:
+             ids |= bibtex_exact.get(bib_title, set())
+             ids |= bibtex_prefix.get(prefix_key(bib_title), set())
+         if paper_title:
+             ids |= paper_exact.get(paper_title, set())
+             ids |= paper_prefix.get(prefix_key(paper_title), set())
+         return sorted(ids)
+
+     for bundle in inputs:
+         template_tag = bundle.get("template_tag")
+         papers = bundle.get("papers") or []
+         for paper in papers:
+             if not isinstance(paper, dict):
+                 raise ValueError("Input papers must be objects")
+             bib_title = _extract_bibtex_title(paper)
+             paper_title = _extract_paper_title(paper)
+             match = None
+             match_idx = None
+             for idx in candidate_ids(bib_title, paper_title):
+                 candidate = merged[idx]
+                 if _titles_match(candidate, paper, threshold=threshold):
+                     match = candidate
+                     match_idx = idx
+                     break
+             if match is None:
+                 group = {
+                     "templates": {template_tag: paper},
+                     "template_order": [template_tag],
+                 }
+                 _add_merge_titles(group, paper)
+                 merged.append(group)
+                 group_idx = len(merged) - 1
+                 if bib_title:
+                     add_index(bib_title, bibtex_exact, bibtex_prefix, group_idx)
+                 if paper_title:
+                     add_index(paper_title, paper_exact, paper_prefix, group_idx)
+             else:
+                 templates = match.setdefault("templates", {})
+                 templates[template_tag] = paper
+                 order = match.setdefault("template_order", [])
+                 if template_tag not in order:
+                     order.append(template_tag)
+                 _add_merge_titles(match, paper)
+                 if match_idx is not None:
+                     if bib_title:
+                         add_index(bib_title, bibtex_exact, bibtex_prefix, match_idx)
+                     if paper_title:
+                         add_index(paper_title, paper_exact, paper_prefix, match_idx)
+
+     for group in merged:
+         templates = group.get("templates") or {}
+         order = group.get("template_order") or list(templates.keys())
+         default_tag = "simple" if "simple" in order else (order[0] if order else None)
+         group["default_template"] = default_tag
+         if default_tag and default_tag in templates:
+             base = templates[default_tag]
+             for key, value in base.items():
+                 group[key] = value
+         group.pop("_merge_bibtex_titles", None)
+         group.pop("_merge_paper_titles", None)
+     return merged
+
+
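A sketch of how two extractions of the same paper collapse into one merged group, with "simple" preferred as the default template; the inputs are hypothetical:

    inputs = [
        {"template_tag": "simple", "papers": [{"paper_title": "Deep Residual Learning"}]},
        {"template_tag": "detailed", "papers": [{"paper_title": "Deep Residual Learning"}]},
    ]
    merged = _merge_paper_inputs(inputs)
    # len(merged) == 1; merged[0]["default_template"] == "simple"
    # merged[0]["templates"] holds both the "simple" and "detailed" records.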
1278
+ def _normalize_bibtex_title(title: str) -> str:
1279
+ """Normalize bibtex title for matching."""
1280
+ value = title.replace("{", "").replace("}", "")
1281
+ value = re.sub(r"[^a-z0-9]+", " ", value.lower())
1282
+ return re.sub(r"\s+", " ", value).strip()
1283
+
1284
+
1285
+ def enrich_with_bibtex(papers: list[dict[str, Any]], bibtex_path: Path) -> None:
1286
+ """Enrich papers with bibtex metadata."""
1287
+ if not PYBTEX_AVAILABLE:
1288
+ raise RuntimeError("pybtex is required for --bibtex support")
1289
+
1290
+ bib_data = parse_file(str(bibtex_path))
1291
+ entries: list[dict[str, Any]] = []
1292
+ by_prefix: dict[str, list[int]] = {}
1293
+ for key, entry in bib_data.entries.items():
1294
+ fields = dict(entry.fields)
1295
+ title = str(fields.get("title") or "").strip()
1296
+ title_norm = _normalize_bibtex_title(title)
1297
+ if not title_norm:
1298
+ continue
1299
+ record = {
1300
+ "key": key,
1301
+ "type": entry.type,
1302
+ "fields": fields,
1303
+ "persons": {role: [str(p) for p in persons] for role, persons in entry.persons.items()},
1304
+ "_title_norm": title_norm,
1305
+ }
1306
+ idx = len(entries)
1307
+ entries.append(record)
1308
+ prefix = title_norm[:16]
1309
+ by_prefix.setdefault(prefix, []).append(idx)
1310
+
1311
+ for paper in papers:
1312
+ if isinstance(paper.get("bibtex"), dict):
1313
+ continue
1314
+ title = str(paper.get("paper_title") or "").strip()
1315
+ if not title:
1316
+ continue
1317
+ norm = _normalize_bibtex_title(title)
1318
+ if not norm:
1319
+ continue
1320
+
1321
+ candidates = []
1322
+ prefix = norm[:16]
1323
+ for cand_idx in by_prefix.get(prefix, []):
1324
+ candidates.append(entries[cand_idx])
1325
+ if not candidates:
1326
+ candidates = entries
1327
+
1328
+ best = None
1329
+ best_score = 0.0
1330
+ for entry in candidates:
1331
+ score = _title_similarity(norm, entry["_title_norm"])
1332
+ if score > best_score:
1333
+ best_score = score
1334
+ best = entry
1335
+
1336
+ if best is not None and best_score >= 0.9:
1337
+ paper["bibtex"] = {k: v for k, v in best.items() if not k.startswith("_")}
1338
+
1339
+
1340
+ def load_and_merge_papers(
1341
+ db_paths: list[Path],
1342
+ bibtex_path: Path | None = None,
1343
+ cache_dir: Path | None = None,
1344
+ use_cache: bool = True,
1345
+ pdf_roots: list[Path] | None = None,
1346
+ ) -> list[dict[str, Any]]:
1347
+ """Load and merge papers from multiple JSON files, with optional caching and PDF-only entries."""
1348
+ cache_meta = None
1349
+ pdf_roots = pdf_roots or []
1350
+ pdf_paths: list[Path] = []
1351
+ pdf_roots_meta: list[dict[str, Any]] | None = None
1352
+ if pdf_roots:
1353
+ pdf_paths, pdf_roots_meta = _scan_pdf_roots(pdf_roots)
1354
+ if cache_dir and use_cache:
1355
+ cache_dir.mkdir(parents=True, exist_ok=True)
1356
+ cache_meta = _build_cache_meta(db_paths, bibtex_path, pdf_roots_meta)
1357
+ cached = _load_cached_papers(cache_dir, cache_meta)
1358
+ if cached is not None:
1359
+ return cached
1360
+
1361
+ inputs = _load_paper_inputs(db_paths)
1362
+ if bibtex_path is not None:
1363
+ for bundle in inputs:
1364
+ enrich_with_bibtex(bundle["papers"], bibtex_path)
1365
+ papers = _merge_paper_inputs(inputs)
1366
+ if pdf_paths:
1367
+ pdf_index = _build_file_index_from_paths(pdf_paths, suffixes={".pdf"})
1368
+ papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
1369
+
1370
+ if cache_dir and use_cache and cache_meta is not None:
1371
+ _write_cached_papers(cache_dir, cache_meta, papers)
1372
+ return papers
1373
+
1374
+
1375
+ # ============================================================================
1376
+ # Compare Logic for paper db compare
1377
+ # ============================================================================
1378
+
1379
+ from typing import Literal
1380
+
1381
+
1382
+ @dataclass
1383
+ class CompareResult:
1384
+ """Result of comparing two datasets."""
1385
+ side: Literal["A", "B", "MATCH"]
1386
+ source_hash: str
1387
+ title: str
1388
+ match_status: Literal["matched", "only_in_A", "only_in_B", "matched_pair"]
1389
+ match_type: str | None = None
1390
+ match_score: float = 0.0
1391
+ source_path: str | None = None
1392
+ other_source_hash: str | None = None
1393
+ other_title: str | None = None
1394
+ other_source_path: str | None = None
1395
+ lang: str | None = None
1396
+
1397
+
1398
+ @dataclass
1399
+ class CompareDataset:
1400
+ """Prepared dataset for compare."""
1401
+ papers: list[dict[str, Any]]
1402
+ md_index: dict[str, list[Path]]
1403
+ pdf_index: dict[str, list[Path]]
1404
+ translated_index: dict[str, dict[str, Path]]
1405
+ paper_index: dict[str, list[dict[str, Any]]]
1406
+ path_to_index: dict[Path, int]
1407
+ hash_to_index: dict[str, int]
1408
+ paper_id_to_index: dict[int, int]
1409
+
1410
+
1411
+ def _scan_md_roots(roots: list[Path]) -> list[Path]:
1412
+ paths: list[Path] = []
1413
+ for root in roots:
1414
+ try:
1415
+ if not root.exists() or not root.is_dir():
1416
+ continue
1417
+ except OSError:
1418
+ continue
1419
+ try:
1420
+ for path in root.rglob("*.md"):
1421
+ try:
1422
+ if not path.is_file():
1423
+ continue
1424
+ except OSError:
1425
+ continue
1426
+ paths.append(path.resolve())
1427
+ except OSError:
1428
+ continue
1429
+ return paths
1430
+
1431
+
1432
+ def _merge_file_indexes(*indexes: dict[str, list[Path]]) -> dict[str, list[Path]]:
1433
+ merged: dict[str, list[Path]] = {}
1434
+ for index in indexes:
1435
+ for key, paths in index.items():
1436
+ merged.setdefault(key, []).extend(paths)
1437
+ return merged
1438
+
1439
+
1440
+ def _build_md_only_entries(
1441
+ papers: list[dict[str, Any]],
1442
+ md_paths: list[Path],
1443
+ md_index: dict[str, list[Path]],
1444
+ ) -> list[dict[str, Any]]:
1445
+ matched: set[Path] = set()
1446
+ for paper in papers:
1447
+ _prepare_paper_matching_fields(paper)
1448
+ md_path = _resolve_source_md(paper, md_index)
1449
+ if md_path:
1450
+ matched.add(md_path.resolve())
1451
+ entries: list[dict[str, Any]] = []
1452
+ for path in md_paths:
1453
+ resolved = path.resolve()
1454
+ if resolved in matched:
1455
+ continue
1456
+ title = _extract_title_from_filename(resolved.name) or resolved.stem
1457
+ year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
1458
+ entry: dict[str, Any] = {
1459
+ "paper_title": title,
1460
+ "paper_authors": [author_hint] if author_hint else [],
1461
+ "publication_date": year_hint or "",
1462
+ "source_hash": stable_hash(str(resolved)),
1463
+ "source_path": str(resolved),
1464
+ "_is_md_only": True,
1465
+ }
1466
+ entries.append(entry)
1467
+ return entries
1468
+
1469
+
1470
+ def _translation_base_key_for_paper(paper: dict[str, Any]) -> str:
1471
+ source_path = str(paper.get("source_path") or "")
1472
+ if source_path:
1473
+ return Path(source_path).stem.lower()
1474
+ title = str(paper.get("paper_title") or "")
1475
+ return _normalize_title_key(title)
1476
+
1477
+
1478
+ def _build_translated_only_entries(
1479
+ papers: list[dict[str, Any]],
1480
+ translated_index: dict[str, dict[str, Path]],
1481
+ lang: str,
1482
+ ) -> list[dict[str, Any]]:
1483
+ if not lang:
1484
+ return []
1485
+ lang_key = lang.lower()
1486
+ matched: set[Path] = set()
1487
+ for paper in papers:
1488
+ base_key = _translation_base_key_for_paper(paper)
1489
+ if not base_key:
1490
+ continue
1491
+ path = translated_index.get(base_key, {}).get(lang_key)
1492
+ if path:
1493
+ matched.add(path.resolve())
1494
+ entries: list[dict[str, Any]] = []
1495
+ for base_key, translations in translated_index.items():
1496
+ path = translations.get(lang_key)
1497
+ if not path:
1498
+ continue
1499
+ resolved = path.resolve()
1500
+ if resolved in matched:
1501
+ continue
1502
+ title = _extract_title_from_filename(resolved.name) or resolved.stem
1503
+ year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
1504
+ entry: dict[str, Any] = {
1505
+ "paper_title": title,
1506
+ "paper_authors": [author_hint] if author_hint else [],
1507
+ "publication_date": year_hint or "",
1508
+ "source_hash": stable_hash(str(resolved)),
1509
+ "source_path": str(resolved),
1510
+ "_is_translated_only": True,
1511
+ "translation_lang": lang_key,
1512
+ }
1513
+ entries.append(entry)
1514
+ return entries
1515
+
1516
+
1517
+ def _build_paper_index(papers: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
1518
+ index: dict[str, list[dict[str, Any]]] = {}
1519
+ for paper in papers:
1520
+ _prepare_paper_matching_fields(paper)
1521
+ title = str(paper.get("paper_title") or "")
1522
+ title_key = _normalize_title_key(title)
1523
+ if title_key:
1524
+ index.setdefault(title_key, []).append(paper)
1525
+ compact_key = _compact_title_key(title_key)
1526
+ if compact_key:
1527
+ index.setdefault(f"compact:{compact_key}", []).append(paper)
1528
+ prefix_key = _title_prefix_key(title_key)
1529
+ if prefix_key:
1530
+ index.setdefault(prefix_key, []).append(paper)
1531
+ stripped_key = _strip_leading_numeric_tokens(title_key)
1532
+ if stripped_key and stripped_key != title_key:
1533
+ index.setdefault(stripped_key, []).append(paper)
1534
+ stripped_compact = _compact_title_key(stripped_key)
1535
+ if stripped_compact:
1536
+ index.setdefault(f"compact:{stripped_compact}", []).append(paper)
1537
+ stripped_prefix = _title_prefix_key(stripped_key)
1538
+ if stripped_prefix:
1539
+ index.setdefault(stripped_prefix, []).append(paper)
1540
+ year = str(paper.get("_year") or "").strip()
1541
+ if year:
1542
+ index.setdefault(f"year:{year}", []).append(paper)
1543
+ authors = paper.get("_authors") or []
1544
+ if authors:
1545
+ author_key = _normalize_author_key(str(authors[0]))
1546
+ if author_key:
1547
+ index.setdefault(f"authoryear:{year}:{author_key}", []).append(paper)
1548
+ return index
1549
+
1550
+
1551
+ def _adaptive_similarity_match_papers(
1552
+ title_key: str,
1553
+ candidates: list[dict[str, Any]],
1554
+ ) -> tuple[dict[str, Any] | None, float]:
1555
+ if not title_key:
1556
+ return None, 0.0
1557
+ scored: list[tuple[dict[str, Any], float]] = []
1558
+ for paper in candidates:
1559
+ candidate_title = _normalize_title_key(str(paper.get("paper_title") or ""))
1560
+ if not candidate_title:
1561
+ continue
1562
+ if _title_overlap_match(title_key, candidate_title):
1563
+ return paper, 1.0
1564
+ scored.append((paper, _title_similarity(title_key, candidate_title)))
1565
+ if not scored:
1566
+ return None, 0.0
1567
+
1568
+ def matches_at(threshold: float) -> list[tuple[dict[str, Any], float]]:
1569
+ return [(paper, score) for paper, score in scored if score >= threshold]
1570
+
1571
+ threshold = _SIMILARITY_START
1572
+ step = _SIMILARITY_STEP
1573
+ prev_threshold = None
1574
+ prev_count = None
1575
+ for _ in range(_SIMILARITY_MAX_STEPS):
1576
+ matches = matches_at(threshold)
1577
+ if len(matches) == 1:
1578
+ paper, score = matches[0]
1579
+ return paper, score
1580
+ if len(matches) == 0:
1581
+ prev_threshold = threshold
1582
+ prev_count = 0
1583
+ threshold -= step
1584
+ continue
1585
+ if prev_count == 0 and prev_threshold is not None:
1586
+ low = threshold
1587
+ high = prev_threshold
1588
+ for _ in range(_SIMILARITY_MAX_STEPS):
1589
+ mid = (low + high) / 2
1590
+ mid_matches = matches_at(mid)
1591
+ if len(mid_matches) == 1:
1592
+ paper, score = mid_matches[0]
1593
+ return paper, score
1594
+ if len(mid_matches) == 0:
1595
+ high = mid
1596
+ else:
1597
+ low = mid
1598
+ return None, 0.0
1599
+ prev_threshold = threshold
1600
+ prev_count = len(matches)
1601
+ threshold -= step
1602
+ return None, 0.0
1603
+
1604
+
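The adaptive search lowers the similarity cutoff step by step; if the candidate count ever jumps from zero straight past one, it bisects between the last empty threshold and the current crowded one, hunting for a cutoff that admits exactly one candidate. A self-contained sketch of that search over bare scores (the `_SIMILARITY_*` constants are not shown in this hunk, so the defaults below are assumptions):

    def unique_above(scores: list[float], start: float = 0.95,
                     step: float = 0.05, max_steps: int = 12) -> float | None:
        def count(t: float) -> int:
            return sum(s >= t for s in scores)

        threshold, prev_zero = start, None
        for _ in range(max_steps):
            n = count(threshold)
            if n == 1:
                return max(s for s in scores if s >= threshold)
            if n == 0:
                prev_zero = threshold
            elif prev_zero is not None:
                # Count jumped 0 -> many: bisect for a cutoff admitting exactly one.
                low, high = threshold, prev_zero
                for _ in range(max_steps):
                    mid = (low + high) / 2
                    m = count(mid)
                    if m == 1:
                        return max(s for s in scores if s >= mid)
                    low, high = (mid, high) if m > 1 else (low, mid)
                return None
            threshold -= step
        return None

    print(unique_above([0.84, 0.83]))  # 0.84

With scores [0.84, 0.83] the stepped scan sees zero matches at 0.85 and two at 0.80, and the bisection settles on the unique 0.84.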
1605
+ def _resolve_paper_by_title_and_meta(
1606
+ paper: dict[str, Any],
1607
+ paper_index: dict[str, list[dict[str, Any]]],
1608
+ ) -> tuple[dict[str, Any] | None, str | None, float]:
1609
+ title = str(paper.get("paper_title") or "")
1610
+ title_key = _normalize_title_key(title)
1611
+ if not title_key:
1612
+ title_key = ""
1613
+ candidates = paper_index.get(title_key, [])
1614
+ if candidates:
1615
+ return candidates[0], "title", 1.0
1616
+ if title_key:
1617
+ compact_key = _compact_title_key(title_key)
1618
+ compact_candidates = paper_index.get(f"compact:{compact_key}", [])
1619
+ if compact_candidates:
1620
+ return compact_candidates[0], "title_compact", 1.0
1621
+ stripped_key = _strip_leading_numeric_tokens(title_key)
1622
+ if stripped_key and stripped_key != title_key:
1623
+ stripped_candidates = paper_index.get(stripped_key, [])
1624
+ if stripped_candidates:
1625
+ return stripped_candidates[0], "title_stripped", 1.0
1626
+ stripped_compact = _compact_title_key(stripped_key)
1627
+ stripped_candidates = paper_index.get(f"compact:{stripped_compact}", [])
1628
+ if stripped_candidates:
1629
+ return stripped_candidates[0], "title_compact", 1.0
1630
+ prefix_candidates: list[dict[str, Any]] = []
1631
+ prefix_key = _title_prefix_key(title_key)
1632
+ if prefix_key:
1633
+ prefix_candidates = paper_index.get(prefix_key, [])
1634
+ if not prefix_candidates:
1635
+ stripped_key = _strip_leading_numeric_tokens(title_key)
1636
+ if stripped_key and stripped_key != title_key:
1637
+ prefix_key = _title_prefix_key(stripped_key)
1638
+ if prefix_key:
1639
+ prefix_candidates = paper_index.get(prefix_key, [])
1640
+ if prefix_candidates:
1641
+ match, score = _adaptive_similarity_match_papers(title_key, prefix_candidates)
1642
+ if match is not None:
1643
+ match_type = "title_prefix" if score >= 1.0 else "title_fuzzy"
1644
+ return match, match_type, score
1645
+ year = str(paper.get("_year") or "").strip()
1646
+ if not year.isdigit():
1647
+ return None, None, 0.0
1648
+ author_key = ""
1649
+ authors = paper.get("_authors") or []
1650
+ if authors:
1651
+ author_key = _normalize_author_key(str(authors[0]))
1652
+ candidates = []
1653
+ match_type = "year"
1654
+ if author_key:
1655
+ candidates = paper_index.get(f"authoryear:{year}:{author_key}", [])
1656
+ if candidates:
1657
+ match_type = "author_year"
1658
+ if not candidates:
1659
+ candidates = paper_index.get(f"year:{year}", [])
1660
+ if not candidates:
1661
+ return None, None, 0.0
1662
+ if len(candidates) == 1 and not title_key:
1663
+ return candidates[0], match_type, 1.0
1664
+ match, score = _adaptive_similarity_match_papers(title_key, candidates)
1665
+ if match is not None:
1666
+ if score < _AUTHOR_YEAR_MIN_SIMILARITY:
1667
+ return None, None, 0.0
1668
+ return match, "title_fuzzy", score
1669
+ return None, None, 0.0
1670
+
1671
+
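A condensed view of the probe order above, skipping the prefix and fuzzy stages. The key builders reuse the simplified stand-ins from the index sketch earlier, so this is illustrative rather than the package's exact behavior:

    from typing import Any

    def resolve(index: dict[str, list[dict[str, Any]]], title_key: str,
                year: str = "", author_key: str = "") -> tuple[dict[str, Any] | None, str | None]:
        probes: list[tuple[str, str]] = []
        if title_key:
            probes.append((title_key, "title"))
            probes.append((f"compact:{title_key.replace(' ', '')}", "title_compact"))
        if year.isdigit():
            if author_key:
                probes.append((f"authoryear:{year}:{author_key}", "author_year"))
            probes.append((f"year:{year}", "year"))
        for probe, kind in probes:
            hits = index.get(probe, [])
            if hits:
                return hits[0], kind
        return None, None

    # Falls through both title probes and lands on the year bucket:
    print(resolve({"year:2017": [{"paper_title": "X"}]}, "unknown title", year="2017"))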
1672
+ def _get_paper_identifier(paper: dict[str, Any]) -> str:
1673
+ """Get a unique identifier for a paper."""
1674
+ return str(paper.get("source_hash") or paper.get("source_path", ""))
1675
+
1676
+
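For two papers ingested from different machines, `source_path` usually differs while `source_hash` stays stable, which is why the hash is preferred; the path is only a last-resort identity for entries synthesized from bare files.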
1677
+ def _match_datasets(
1678
+ dataset_a: CompareDataset,
1679
+ dataset_b: CompareDataset,
1680
+ *,
1681
+ lang: str | None = None,
1682
+ ) -> list[CompareResult]:
1683
+ """Match papers between two datasets using db_ops parity."""
1684
+ results: list[CompareResult] = []
1685
+ matched_a: set[int] = set()
1686
+ matched_b: set[int] = set()
1687
+ matched_b_info: dict[int, tuple[int, str | None, float]] = {}
1688
+ match_pairs: list[tuple[int, int, str | None, float]] = []
1689
+
1690
+ file_index_b = _merge_file_indexes(dataset_b.md_index, dataset_b.pdf_index)
1691
+
1692
+ for idx_a, paper in enumerate(dataset_a.papers):
1693
+ _prepare_paper_matching_fields(paper)
1694
+ source_hash = str(paper.get("source_hash") or "")
1695
+ title = str(paper.get("paper_title") or "")
1696
+ source_path = str(paper.get("source_path") or "")
1697
+
1698
+ match_type = None
1699
+ match_score = 0.0
1700
+ match_status = "only_in_A"
1701
+ matched_b_idx: int | None = None
1702
+ matched_b_paper: dict[str, Any] | None = None
1703
+
1704
+ if source_hash and source_hash in dataset_b.hash_to_index:
1705
+ matched_b_idx = dataset_b.hash_to_index[source_hash]
1706
+ matched_b_paper = dataset_b.papers[matched_b_idx]
1707
+ match_status = "matched"
1708
+ match_type = "hash"
1709
+ match_score = 1.0
1710
+ else:
1711
+ if file_index_b:
1712
+ matched_path, mt, score = _resolve_by_title_and_meta(paper, file_index_b)
1713
+ if matched_path is not None:
1714
+ matched_b_idx = dataset_b.path_to_index.get(matched_path.resolve())
1715
+ matched_b_paper = dataset_b.papers[matched_b_idx] if matched_b_idx is not None else None
1716
+ match_status = "matched"
1717
+ match_type = mt
1718
+ match_score = score
1719
+ if matched_b_idx is None:
1720
+ match_paper, mt, score = _resolve_paper_by_title_and_meta(paper, dataset_b.paper_index)
1721
+ if match_paper is not None:
1722
+ matched_b_idx = dataset_b.paper_id_to_index.get(id(match_paper))
1723
+ matched_b_paper = match_paper
1724
+ match_status = "matched"
1725
+ match_type = mt
1726
+ match_score = score
1727
+ if matched_b_idx is None and lang:
1728
+ base_key = _translation_base_key_for_paper(paper)
1729
+ if base_key:
1730
+ translated_path = dataset_b.translated_index.get(base_key, {}).get(lang.lower())
1731
+ if translated_path is not None:
1732
+ matched_b_idx = dataset_b.path_to_index.get(translated_path.resolve())
1733
+ matched_b_paper = dataset_b.papers[matched_b_idx] if matched_b_idx is not None else None
1734
+ match_status = "matched"
1735
+ match_type = f"translated_{lang.lower()}"
1736
+ match_score = 1.0
1737
+
1738
+ other_hash = None
1739
+ other_title = None
1740
+ other_path = None
1741
+ if matched_b_idx is not None and matched_b_paper is not None:
1742
+ matched_a.add(idx_a)
1743
+ matched_b.add(matched_b_idx)
1744
+ other_hash = str(matched_b_paper.get("source_hash") or "")
1745
+ other_title = str(matched_b_paper.get("paper_title") or "")
1746
+ other_path = str(matched_b_paper.get("source_path") or "")
1747
+ matched_b_info[matched_b_idx] = (idx_a, match_type, match_score)
1748
+ match_pairs.append((idx_a, matched_b_idx, match_type, match_score))
1749
+
1750
+ results.append(
1751
+ CompareResult(
1752
+ side="A",
1753
+ source_hash=source_hash,
1754
+ title=title,
1755
+ match_status=match_status,
1756
+ match_type=match_type,
1757
+ match_score=match_score,
1758
+ source_path=source_path if source_path else None,
1759
+ other_source_hash=other_hash,
1760
+ other_title=other_title,
1761
+ other_source_path=other_path,
1762
+ lang=lang.lower() if lang else None,
1763
+ )
1764
+ )
1765
+
1766
+ for idx_b, paper in enumerate(dataset_b.papers):
1767
+ _prepare_paper_matching_fields(paper)
1768
+ source_hash = str(paper.get("source_hash") or "")
1769
+ title = str(paper.get("paper_title") or "")
1770
+ source_path = str(paper.get("source_path") or "")
1771
+ match_status = "only_in_B"
1772
+ match_type = None
1773
+ match_score = 0.0
1774
+ other_hash = None
1775
+ other_title = None
1776
+ other_path = None
1777
+ if idx_b in matched_b:
1778
+ match_status = "matched"
1779
+ info = matched_b_info.get(idx_b)
1780
+ if info:
1781
+ idx_a, match_type, match_score = info
1782
+ a_paper = dataset_a.papers[idx_a]
1783
+ other_hash = str(a_paper.get("source_hash") or "")
1784
+ other_title = str(a_paper.get("paper_title") or "")
1785
+ other_path = str(a_paper.get("source_path") or "")
1786
+ results.append(
1787
+ CompareResult(
1788
+ side="B",
1789
+ source_hash=source_hash,
1790
+ title=title,
1791
+ match_status=match_status,
1792
+ match_type=match_type,
1793
+ match_score=match_score,
1794
+ source_path=source_path if source_path else None,
1795
+ other_source_hash=other_hash,
1796
+ other_title=other_title,
1797
+ other_source_path=other_path,
1798
+ lang=lang.lower() if lang else None,
1799
+ )
1800
+ )
1801
+
1802
+ for idx_a, idx_b, match_type, match_score in match_pairs:
1803
+ paper_a = dataset_a.papers[idx_a]
1804
+ paper_b = dataset_b.papers[idx_b]
1805
+ results.append(
1806
+ CompareResult(
1807
+ side="MATCH",
1808
+ source_hash=str(paper_a.get("source_hash") or ""),
1809
+ title=str(paper_a.get("paper_title") or ""),
1810
+ match_status="matched_pair",
1811
+ match_type=match_type,
1812
+ match_score=match_score,
1813
+ source_path=str(paper_a.get("source_path") or "") or None,
1814
+ other_source_hash=str(paper_b.get("source_hash") or ""),
1815
+ other_title=str(paper_b.get("paper_title") or ""),
1816
+ other_source_path=str(paper_b.get("source_path") or "") or None,
1817
+ lang=lang.lower() if lang else None,
1818
+ )
1819
+ )
1820
+
1821
+ return results
1822
+
1823
+
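Each comparison emits three views: per-paper rows for side A and side B, plus one side="MATCH" row per matched pair. A hypothetical consumer that tallies them (field names follow the CompareResult constructor calls above):

    from collections import Counter

    def summarize(results) -> Counter:
        tally: Counter = Counter()
        for row in results:
            if row.side == "MATCH":
                tally[f"matched:{row.match_type}"] += 1  # e.g. matched:hash, matched:title_fuzzy
            elif row.match_status.startswith("only_in_"):
                tally[row.match_status] += 1             # only_in_A / only_in_B
        return tally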
1824
+ def build_compare_dataset(
1825
+ *,
1826
+ json_paths: list[Path] | None = None,
1827
+ pdf_roots: list[Path] | None = None,
1828
+ md_roots: list[Path] | None = None,
1829
+ md_translated_roots: list[Path] | None = None,
1830
+ bibtex_path: Path | None = None,
1831
+ lang: str | None = None,
1832
+ ) -> CompareDataset:
1833
+ """Load and index a dataset from various sources."""
1834
+ papers: list[dict[str, Any]] = []
1835
+
1836
+ # Load from JSON files
1837
+ if json_paths:
1838
+ for path in json_paths:
1839
+ data = json.loads(path.read_text(encoding="utf-8"))
1840
+ if isinstance(data, list):
1841
+ # Array format - direct list of papers
1842
+ papers.extend(data)
1843
+ elif isinstance(data, dict):
1844
+ # Object format with template_tag and papers
1845
+ if isinstance(data.get("papers"), list):
1846
+ papers.extend(data["papers"])
1847
+ else:
1848
+ raise ValueError(f"Invalid JSON format in {path}")
1849
+
1850
+ # Enrich with BibTeX metadata when provided (silently skipped if pybtex is unavailable)
1851
+ if bibtex_path and PYBTEX_AVAILABLE:
1852
+ enrich_with_bibtex(papers, bibtex_path)
1853
+
1854
+ for paper in papers:
1855
+ _prepare_paper_matching_fields(paper)
1856
+
1857
+ md_paths = _scan_md_roots(md_roots or [])
1858
+ pdf_paths, _ = _scan_pdf_roots(pdf_roots or [])
1859
+ md_index = _build_file_index(md_roots or [], suffixes={".md"})
1860
+ pdf_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
1861
+ translated_index = _build_translated_index(md_translated_roots or [])
1862
+
1863
+ if pdf_paths:
1864
+ papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
1865
+ if md_paths:
1866
+ papers.extend(_build_md_only_entries(papers, md_paths, md_index))
1867
+ if translated_index and lang:
1868
+ papers.extend(_build_translated_only_entries(papers, translated_index, lang))
1869
+
1870
+ for paper in papers:
1871
+ _prepare_paper_matching_fields(paper)
1872
+
1873
+ paper_index = _build_paper_index(papers)
1874
+ path_to_index: dict[Path, int] = {}
1875
+ hash_to_index: dict[str, int] = {}
1876
+ paper_id_to_index: dict[int, int] = {}
1877
+ for idx, paper in enumerate(papers):
1878
+ paper_id_to_index[id(paper)] = idx
1879
+ source_hash = str(paper.get("source_hash") or "")
1880
+ if source_hash and source_hash not in hash_to_index:
1881
+ hash_to_index[source_hash] = idx
1882
+ source_path = paper.get("source_path")
1883
+ if source_path:
1884
+ path_to_index[Path(str(source_path)).resolve()] = idx
1885
+
1886
+ return CompareDataset(
1887
+ papers=papers,
1888
+ md_index=md_index,
1889
+ pdf_index=pdf_index,
1890
+ translated_index=translated_index,
1891
+ paper_index=paper_index,
1892
+ path_to_index=path_to_index,
1893
+ hash_to_index=hash_to_index,
1894
+ paper_id_to_index=paper_id_to_index,
1895
+ )
1896
+
1897
+
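A minimal usage sketch, assuming the module is importable as deepresearch_flow.paper.db_ops and that a paper object needs nothing beyond the fields shown; it writes a one-paper JSON corpus to a temporary directory and indexes it:

    import json
    import tempfile
    from pathlib import Path

    from deepresearch_flow.paper.db_ops import build_compare_dataset

    with tempfile.TemporaryDirectory() as tmp:
        corpus = Path(tmp) / "papers.json"
        corpus.write_text(json.dumps({"papers": [{
            "paper_title": "Attention Is All You Need",
            "source_hash": "abc123",
            "publication_date": "2017",
        }]}), encoding="utf-8")
        dataset = build_compare_dataset(json_paths=[corpus])
        print(len(dataset.papers), list(dataset.hash_to_index))  # expect: 1 ['abc123']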
1898
+ def compare_datasets(
1899
+ *,
1900
+ json_paths_a: list[Path] | None = None,
1901
+ pdf_roots_a: list[Path] | None = None,
1902
+ md_roots_a: list[Path] | None = None,
1903
+ md_translated_roots_a: list[Path] | None = None,
1904
+ json_paths_b: list[Path] | None = None,
1905
+ pdf_roots_b: list[Path] | None = None,
1906
+ md_roots_b: list[Path] | None = None,
1907
+ md_translated_roots_b: list[Path] | None = None,
1908
+ bibtex_path: Path | None = None,
1909
+ lang: str | None = None,
1910
+ ) -> list[CompareResult]:
1911
+ """Compare two datasets and return comparison results."""
1912
+ # Validate language requirement for translated inputs
1913
+ has_translated_a = md_translated_roots_a is not None and len(md_translated_roots_a) > 0
1914
+ has_translated_b = md_translated_roots_b is not None and len(md_translated_roots_b) > 0
1915
+
1916
+ if (has_translated_a or has_translated_b) and lang is None:
1917
+ raise ValueError(
1918
+ "--lang parameter is required when comparing translated Markdown datasets"
1919
+ )
1920
+
1921
+ dataset_a = build_compare_dataset(
1922
+ json_paths=json_paths_a,
1923
+ pdf_roots=pdf_roots_a,
1924
+ md_roots=md_roots_a,
1925
+ md_translated_roots=md_translated_roots_a,
1926
+ bibtex_path=bibtex_path,
1927
+ lang=lang,
1928
+ )
1929
+
1930
+ dataset_b = build_compare_dataset(
1931
+ json_paths=json_paths_b,
1932
+ pdf_roots=pdf_roots_b,
1933
+ md_roots=md_roots_b,
1934
+ md_translated_roots=md_translated_roots_b,
1935
+ bibtex_path=bibtex_path,
1936
+ lang=lang,
1937
+ )
1938
+
1939
+ return _match_datasets(dataset_a, dataset_b, lang=lang)
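End-to-end, a comparison reads as below; the corpus paths are hypothetical placeholders. Matched pairs surface twice, once per side and once as a side="MATCH" row, so this loop filters to the per-side rows:

    from pathlib import Path

    from deepresearch_flow.paper.db_ops import compare_datasets

    results = compare_datasets(
        json_paths_a=[Path("corpus_a.json")],  # hypothetical input files
        json_paths_b=[Path("corpus_b.json")],
    )
    for row in results:
        if row.side != "MATCH":
            print(row.side, row.match_status, row.match_type, row.title)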