deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1154 -35
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
  49. deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
  52. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/snapshot/builder.py (new file)
@@ -0,0 +1,965 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import base64
+ from datetime import datetime, timezone
+ import hashlib
+ import json
+ import mimetypes
+ from pathlib import Path
+ import re
+ import sqlite3
+ from typing import Any
+ import uuid
+
+ from deepresearch_flow.paper.db_ops import build_index, load_and_merge_papers
+
+ from deepresearch_flow.paper.render import load_default_template
+ from deepresearch_flow.paper.template_registry import load_render_template
+ from deepresearch_flow.paper.snapshot.identity import (
+     PaperKeyCandidate,
+     build_paper_key_candidates,
+     choose_preferred_key,
+     meta_fingerprint_divergent,
+     paper_id_for_key,
+ )
+ from deepresearch_flow.paper.snapshot.schema import (
+     init_snapshot_db,
+     recompute_facet_counts,
+     recompute_paper_index,
+ )
+ from deepresearch_flow.paper.snapshot.text import (
+     insert_cjk_spaces,
+     markdown_to_plain_text,
+ )
+ from deepresearch_flow.paper.utils import stable_hash
+
+
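+ # Build inputs for a snapshot: where to read papers/markdown/PDFs, where to
+ # write the SQLite snapshot and static assets, and the similarity thresholds
+ # used when re-matching metadata-keyed papers against a previous snapshot.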
+ @dataclass(frozen=True)
+ class SnapshotBuildOptions:
+     input_paths: list[Path]
+     bibtex_path: Path | None
+     md_roots: list[Path]
+     md_translated_roots: list[Path]
+     pdf_roots: list[Path]
+     output_db: Path
+     static_export_dir: Path
+     previous_snapshot_db: Path | None
+     min_meta_title_similarity: float = 0.6
+     min_meta_author_jaccard: float = 0.4
+
+
+ @dataclass(frozen=True)
+ class PreviousAlias:
+     paper_id: str
+     paper_key_type: str
+     meta_fingerprint: str | None
+
+
+ def _hash_file(path: Path) -> str:
+     digest = hashlib.sha256()
+     with path.open("rb") as handle:
+         for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+             digest.update(chunk)
+     return digest.hexdigest()
+
+
+ def _hash_text(text: str) -> str:
+     return hashlib.sha256(text.encode("utf-8", errors="ignore")).hexdigest()
+
+
+ def _hash_bytes(data: bytes) -> str:
+     return hashlib.sha256(data).hexdigest()
+
+
+ def _safe_read_text(path: Path) -> str:
+     try:
+         return path.read_text(encoding="utf-8")
+     except UnicodeDecodeError:
+         return path.read_text(encoding="latin-1")
+
+
+ _MD_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
+ _DATA_URL_PATTERN = re.compile(r"^data:([^;,]+)(;base64)?,(.*)$", re.DOTALL)
+ _IMG_TAG_PATTERN = re.compile(r"<img\b[^>]*>", re.IGNORECASE)
+ _SRC_ATTR_PATTERN = re.compile(r"\bsrc\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)", re.IGNORECASE | re.DOTALL)
+ _EXTENSION_OVERRIDES = {".jpe": ".jpg"}
+ _WHITESPACE_RE = re.compile(r"\s+")
+
+
+ def _split_link_target(raw_link: str) -> tuple[str, str, str, str]:
+     link = raw_link.strip()
+     if link.startswith("<"):
+         end = link.find(">")
+         if end != -1:
+             return link[1:end], link[end + 1 :], "<", ">"
+     parts = link.split()
+     if not parts:
+         return "", "", "", ""
+     target = parts[0]
+     suffix = link[len(target) :]
+     return target, suffix, "", ""
+
+
+ def _normalize_facet_value(value: str | None) -> str:
+     cleaned = str(value or "").strip().lower()
+     cleaned = _WHITESPACE_RE.sub(" ", cleaned)
+     return cleaned
+
+
+ def _extension_from_mime(mime: str) -> str | None:
+     ext = mimetypes.guess_extension(mime, strict=False)
+     if ext in _EXTENSION_OVERRIDES:
+         return _EXTENSION_OVERRIDES[ext]
+     return ext
+
+
+ def _parse_data_url(target: str) -> tuple[str, bytes] | None:
+     match = _DATA_URL_PATTERN.match(target)
+     if not match:
+         return None
+     mime = match.group(1) or ""
+     if not mime.startswith("image/"):
+         return None
+     if match.group(2) != ";base64":
+         return None
+     payload = match.group(3) or ""
+     try:
+         return mime, base64.b64decode(payload)
+     except Exception:
+         return None
+
+
+ def _is_absolute_url(target: str) -> bool:
+     lowered = target.lower()
+     return lowered.startswith(("http://", "https://", "data:", "mailto:", "file:", "#")) or target.startswith("/")
+
+
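+ # Rewrites Markdown image references (plus inline <img> tags and data: URLs)
+ # to content-addressed files under images/, deduplicating writes by SHA-256
+ # and recording one entry per reference so missing assets surface later in
+ # the per-paper manifest.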
+ def _rewrite_markdown_images(
+     markdown: str,
+     *,
+     source_path: Path,
+     images_output_dir: Path,
+     written: set[str],
+ ) -> tuple[str, list[dict[str, Any]]]:
+     images: list[dict[str, Any]] = []
+
+     def store_bytes(mime: str, data: bytes) -> str | None:
+         ext = _extension_from_mime(mime)
+         if not ext:
+             return None
+         digest = _hash_bytes(data)
+         filename = f"{digest}{ext}"
+         rel = f"images/{filename}"
+         if filename not in written:
+             images_output_dir.mkdir(parents=True, exist_ok=True)
+             dest = images_output_dir / filename
+             if not dest.exists():
+                 dest.write_bytes(data)
+             written.add(filename)
+         images.append({"path": rel, "sha256": digest, "ext": ext.lstrip("."), "status": "available"})
+         return rel
+
+     def store_local(target: str) -> str | None:
+         cleaned = target.strip()
+         while cleaned.startswith("../"):
+             cleaned = cleaned[3:]
+         cleaned = cleaned.replace("\\", "/")
+         cleaned = cleaned.lstrip("./")
+         cleaned = cleaned.lstrip("/")
+
+         local_path = (source_path.parent / cleaned).resolve()
+         if local_path.exists() and local_path.is_file():
+             ext = local_path.suffix.lower()
+             digest = _hash_file(local_path)
+             filename = f"{digest}{ext}" if ext else digest
+             rel = f"images/{filename}"
+             if filename not in written:
+                 images_output_dir.mkdir(parents=True, exist_ok=True)
+                 dest = images_output_dir / filename
+                 if not dest.exists():
+                     dest.write_bytes(local_path.read_bytes())
+                 written.add(filename)
+             images.append({"path": rel, "sha256": digest, "ext": ext.lstrip("."), "status": "available"})
+             return rel
+
+         images.append({"path": cleaned, "sha256": None, "ext": Path(cleaned).suffix.lstrip("."), "status": "missing"})
+         return None
+
+     def replace(match) -> str:
+         alt_text = match.group(1)
+         raw_link = match.group(2)
+         target, suffix, prefix, postfix = _split_link_target(raw_link)
+         parsed = _parse_data_url(target)
+         if parsed is not None:
+             mime, data = parsed
+             replacement = store_bytes(mime, data)
+             if not replacement:
+                 return match.group(0)
+             new_link = f"{prefix}{replacement}{postfix}{suffix}"
+             return f"![{alt_text}]({new_link})"
+         if not target or _is_absolute_url(target):
+             return match.group(0)
+
+         rel = store_local(target)
+         if not rel:
+             return match.group(0)
+         new_link = f"{prefix}{rel}{postfix}{suffix}"
+         return f"![{alt_text}]({new_link})"
+
+     rewritten = _MD_IMAGE_RE.sub(replace, markdown)
+
+     def replace_img(match: re.Match[str]) -> str:
+         tag = match.group(0)
+         src_match = _SRC_ATTR_PATTERN.search(tag)
+         if not src_match:
+             return tag
+         raw_value = src_match.group(1)
+         quote = ""
+         if raw_value and raw_value[0] in {"\"", "'"}:
+             quote = raw_value[0]
+             value = raw_value[1:-1]
+         else:
+             value = raw_value
+         parsed = _parse_data_url(value)
+         if parsed is not None:
+             mime, data = parsed
+             replacement = store_bytes(mime, data)
+         elif not _is_absolute_url(value):
+             replacement = store_local(value)
+         else:
+             replacement = None
+         if not replacement:
+             return tag
+         new_src = f"{quote}{replacement}{quote}" if quote else replacement
+         return tag[: src_match.start(1)] + new_src + tag[src_match.end(1) :]
+
+     rewritten = _IMG_TAG_PATTERN.sub(replace_img, rewritten)
+     return rewritten, images
+
+
+ def _sanitize_component(value: str) -> str:
+     text = (value or "").strip()
+     text = re.sub(r'[\\/:\*\?"<>\|]+', "_", text)
+     text = re.sub(r"\s+", "_", text)
+     text = re.sub(r"_+", "_", text)
+     return text.strip("_")
+
+
+ def _normalize_display_venue(value: str) -> str:
+     if not value:
+         return ""
+     text = re.sub(r"\{\{|\}\}", "", value)
+     text = re.sub(r"\s+", " ", text).strip()
+     return text
+
+
+ def _truncate(value: str, max_len: int) -> str:
+     if max_len <= 0:
+         return value
+     return value if len(value) <= max_len else value[:max_len].rstrip("_")
+
+
+ def _folder_names(first_author: str, year: str, title: str, paper_id: str) -> tuple[str, str]:
+     base_author = _truncate(_sanitize_component(first_author) or "unknown", 32)
+     base_year = _sanitize_component(year) or "unknown"
+     base_title = _truncate(_sanitize_component(title) or "untitled", 80)
+     full = _sanitize_component(f"{base_author}_{base_year}_{base_title}__{paper_id}")
+     short = _sanitize_component(f"{base_author}_{base_year}__{paper_id}")
+     if len(full) > 200:
+         return short, _sanitize_component(paper_id)
+     return full, short
+
+
+ _MONTH_WORDS = {
+     "jan": "01",
+     "january": "01",
+     "feb": "02",
+     "february": "02",
+     "mar": "03",
+     "march": "03",
+     "apr": "04",
+     "april": "04",
+     "may": "05",
+     "jun": "06",
+     "june": "06",
+     "jul": "07",
+     "july": "07",
+     "aug": "08",
+     "august": "08",
+     "sep": "09",
+     "sept": "09",
+     "september": "09",
+     "oct": "10",
+     "october": "10",
+     "nov": "11",
+     "november": "11",
+     "dec": "12",
+     "december": "12",
+ }
+
+
+ def _parse_year_month_from_text(text: str) -> tuple[str | None, str | None]:
+     if not text:
+         return None, None
+     value = str(text).strip()
+     if not value:
+         return None, None
+     year_match = re.search(r"(19|20)\d{2}", value)
+     year = year_match.group(0) if year_match else None
+
+     numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", value)
+     if numeric_match:
+         m = int(numeric_match.group(2))
+         month = f"{m:02d}" if 1 <= m <= 12 else None
+         return year, month
+
+     word_match = re.search(
+         r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
+         r"january|february|march|april|june|july|august|september|october|november|december)\b",
+         value.lower(),
+     )
+     if word_match:
+         return year, _MONTH_WORDS.get(word_match.group(0))
+
+     return year, None
+
+
+ def _extract_publication_date(paper: dict[str, Any]) -> str:
+     value = paper.get("publication_date") or paper.get("paper_publication_date") or ""
+     return str(value).strip()
+
+
+ def _load_previous_aliases(db_path: Path) -> dict[str, PreviousAlias]:
+     if not db_path:
+         return {}
+     if not db_path.exists():
+         return {}
+     conn = sqlite3.connect(str(db_path))
+     try:
+         rows = conn.execute(
+             "SELECT paper_key, paper_id, paper_key_type, meta_fingerprint FROM paper_key_alias"
+         ).fetchall()
+     except sqlite3.Error:
+         return {}
+     finally:
+         conn.close()
+     out: dict[str, PreviousAlias] = {}
+     for paper_key, paper_id, paper_key_type, meta_fingerprint in rows:
+         out[str(paper_key)] = PreviousAlias(
+             paper_id=str(paper_id),
+             paper_key_type=str(paper_key_type),
+             meta_fingerprint=str(meta_fingerprint) if meta_fingerprint is not None else None,
+         )
+     return out
+
+
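+ # Keeps paper IDs stable across rebuilds: if any candidate key matches an
+ # alias from the previous snapshot, that ID is reused (strongest match wins);
+ # a meta-keyed match whose fingerprint diverged past the thresholds gets a
+ # fresh ID instead, and keys that map to different prior IDs are reported as
+ # conflicts.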
+ def _pick_paper_id(
+     candidates: list[PaperKeyCandidate],
+     *,
+     previous: dict[str, PreviousAlias],
+     min_meta_title_similarity: float,
+     min_meta_author_jaccard: float,
+ ) -> tuple[str, PaperKeyCandidate, list[str]]:
+     preferred = choose_preferred_key(candidates)
+     matched: list[tuple[PaperKeyCandidate, PreviousAlias]] = []
+     for cand in candidates:
+         prev = previous.get(cand.paper_key)
+         if prev:
+             matched.append((cand, prev))
+     if not matched:
+         return paper_id_for_key(preferred.paper_key), preferred, []
+
+     matched.sort(key=lambda pair: pair[0].strength, reverse=True)
+     chosen_cand, chosen_prev = matched[0]
+     conflicts = []
+     for cand, prev in matched[1:]:
+         if prev.paper_id != chosen_prev.paper_id:
+             conflicts.append(
+                 f"key_conflict:{cand.paper_key} maps {prev.paper_id} vs {chosen_prev.paper_id}"
+             )
+
+     if chosen_cand.key_type == "meta":
+         if meta_fingerprint_divergent(
+             chosen_prev.meta_fingerprint,
+             chosen_cand.meta_fingerprint,
+             min_title_similarity=min_meta_title_similarity,
+             min_author_jaccard=min_meta_author_jaccard,
+         ):
+             conflicts.append(f"meta_divergent:{chosen_cand.paper_key}")
+             return paper_id_for_key(preferred.paper_key), preferred, conflicts
+
+     return chosen_prev.paper_id, preferred, conflicts
+
+
+ def _extract_summary_markdown(paper: dict[str, Any]) -> str:
+     if isinstance(paper.get("summary"), str) and paper.get("summary").strip():
+         return str(paper.get("summary"))
+     templates = paper.get("templates")
+     if isinstance(templates, dict):
+         for template_tag in ("simple", "simple_phi"):
+             tmpl = templates.get(template_tag)
+             if isinstance(tmpl, dict) and isinstance(tmpl.get("summary"), str) and tmpl.get("summary").strip():
+                 return str(tmpl.get("summary"))
+     if isinstance(paper.get("abstract"), str) and paper.get("abstract").strip():
+         return str(paper.get("abstract"))
+     return ""
+
+
+ def _canonical_template_tag(value: str) -> str:
+     tag = (value or "").strip().lower()
+     tag = re.sub(r"[^a-z0-9_-]+", "_", tag)
+     tag = re.sub(r"_+", "_", tag).strip("_")
+     return tag or "default"
+
+
+ def _extract_template_summaries(paper: dict[str, Any]) -> dict[str, str]:
+     summaries: dict[str, str] = {}
+     templates = paper.get("templates")
+     if isinstance(templates, dict):
+         for tag, payload in templates.items():
+             if not isinstance(tag, str) or not tag.strip():
+                 continue
+             canonical_tag = _canonical_template_tag(tag)
+             if not isinstance(payload, dict):
+                 continue
+             for key in ("summary", "abstract"):
+                 value = payload.get(key)
+                 if isinstance(value, str) and value.strip():
+                     summaries[canonical_tag] = value.strip()
+                     break
+             if canonical_tag not in summaries:
+                 summaries[canonical_tag] = _render_template_fallback_markdown(
+                     paper, template_tag=canonical_tag, template_payload=payload
+                 )
+
+     top_level = paper.get("summary")
+     if isinstance(top_level, str) and top_level.strip():
+         tag = _canonical_template_tag(
+             str(paper.get("default_template") or paper.get("prompt_template") or paper.get("template_tag") or "default")
+         )
+         summaries.setdefault(tag, top_level.strip())
+
+     if not summaries:
+         fallback = _extract_summary_markdown(paper)
+         if fallback:
+             summaries["default"] = fallback
+
+     return summaries
+
+
+ def _render_template_fallback_markdown(
+     paper: dict[str, Any],
+     *,
+     template_tag: str,
+     template_payload: dict[str, Any],
+ ) -> str:
+     context = dict(paper)
+     context.update(template_payload)
+     context.setdefault("output_language", paper.get("output_language") or "en")
+
+     try:
+         template = load_render_template(template_tag)
+     except Exception:
+         template = load_default_template()
+
+     try:
+         rendered = template.render(**context)
+         return rendered.strip() if isinstance(rendered, str) else ""
+     except Exception:
+         payload = json.dumps(template_payload, ensure_ascii=False, indent=2)
+         return f"```json\n{payload}\n```"
+
+
+ def _choose_preferred_summary_template(paper: dict[str, Any], summaries: dict[str, str]) -> str:
+     if not summaries:
+         return "default"
+     preferred = _canonical_template_tag(str(paper.get("prompt_template") or paper.get("template_tag") or ""))
+     if preferred and preferred in summaries:
+         return preferred
+     for key in ("simple", "simple_phi"):
+         if key in summaries:
+             return key
+     return sorted(summaries.keys(), key=lambda item: item.lower())[0]
+
+
+ def _summary_preview(markdown: str, *, max_len: int = 320) -> str:
+     if not markdown:
+         return ""
+     text = markdown_to_plain_text(markdown)
+     if len(text) > max_len:
+         return text[: max_len - 1].rstrip() + "…"
+     return text
+
+
+ def _write_json(path: Path, data: Any) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
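+ # End-to-end build: merge paper records, resolve stable IDs, export
+ # content-addressed PDFs/Markdown/images plus per-paper summary and manifest
+ # JSON, then fill the relational, facet-graph, and FTS tables in one pass.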
+ def build_snapshot(opts: SnapshotBuildOptions) -> None:
+     if opts.output_db.exists():
+         opts.output_db.unlink()
+
+     papers = load_and_merge_papers(
+         opts.input_paths,
+         opts.bibtex_path,
+         cache_dir=None,
+         use_cache=False,
+         pdf_roots=opts.pdf_roots,
+     )
+     index = build_index(
+         papers,
+         md_roots=opts.md_roots,
+         md_translated_roots=opts.md_translated_roots,
+         pdf_roots=opts.pdf_roots,
+     )
+
+     previous_aliases = _load_previous_aliases(opts.previous_snapshot_db) if opts.previous_snapshot_db else {}
+     snapshot_build_id = uuid.uuid4().hex
+
+     opts.output_db.parent.mkdir(parents=True, exist_ok=True)
+     conn = sqlite3.connect(str(opts.output_db))
+     conn.row_factory = sqlite3.Row
+     try:
+         init_snapshot_db(conn)
+         conn.execute(
+             "INSERT OR REPLACE INTO snapshot_meta(key, value) VALUES (?, ?)",
+             ("snapshot_build_id", snapshot_build_id),
+         )
+         conn.execute(
+             "INSERT OR REPLACE INTO snapshot_meta(key, value) VALUES (?, ?)",
+             ("built_at", datetime.now(timezone.utc).isoformat()),
+         )
+
+         static_root = opts.static_export_dir
+         (static_root / "pdf").mkdir(parents=True, exist_ok=True)
+         (static_root / "md").mkdir(parents=True, exist_ok=True)
+         (static_root / "md_translate").mkdir(parents=True, exist_ok=True)
+         (static_root / "images").mkdir(parents=True, exist_ok=True)
+         (static_root / "summary").mkdir(parents=True, exist_ok=True)
+         (static_root / "manifest").mkdir(parents=True, exist_ok=True)
+
+         written_images: set[str] = set()
+         facet_node_cache: dict[tuple[str, str], int] = {}
+
+         def get_facet_node_id(facet_type: str, value: str | None) -> int | None:
+             normalized = _normalize_facet_value(value)
+             if not normalized or normalized == "unknown":
+                 return None
+             key = (facet_type, normalized)
+             cached = facet_node_cache.get(key)
+             if cached:
+                 return cached
+             conn.execute(
+                 "INSERT OR IGNORE INTO facet_node(facet_type, value) VALUES (?, ?)",
+                 (facet_type, normalized),
+             )
+             row = conn.execute(
+                 "SELECT node_id FROM facet_node WHERE facet_type = ? AND value = ?",
+                 (facet_type, normalized),
+             ).fetchone()
+             if not row:
+                 return None
+             node_id = int(row["node_id"])
+             facet_node_cache[key] = node_id
+             return node_id
+
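+         # Single transaction for the whole corpus; each paper below is
+         # resolved, exported to the static tree, and indexed in turn.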
+         with conn:
+             for idx, paper in enumerate(index.papers):
+                 candidates = build_paper_key_candidates(paper)
+                 paper_id, preferred, conflicts = _pick_paper_id(
+                     candidates,
+                     previous=previous_aliases,
+                     min_meta_title_similarity=opts.min_meta_title_similarity,
+                     min_meta_author_jaccard=opts.min_meta_author_jaccard,
+                 )
+
+                 title = str(paper.get("paper_title") or "").strip()
+                 year = str(paper.get("_year") or "unknown").strip() or "unknown"
+                 year = year if year.isdigit() else year.lower()
+                 month = "unknown"
+                 pub_date = _extract_publication_date(paper)
+
+                 bib = paper.get("bibtex") if isinstance(paper.get("bibtex"), dict) else None
+                 bib_fields = (bib.get("fields") if isinstance(bib, dict) else None) or {}
+                 bib_year = str(bib_fields.get("year") or "").strip()
+                 bib_month = str(bib_fields.get("month") or "").strip()
+                 if bib_year and not year.isdigit():
+                     parsed_year, _ = _parse_year_month_from_text(bib_year)
+                     if parsed_year:
+                         year = parsed_year
+                 if bib_month:
+                     _, parsed_month = _parse_year_month_from_text(f"2000-{bib_month}")
+                     if parsed_month:
+                         month = parsed_month
+                 if month == "unknown" and pub_date:
+                     _, parsed_month = _parse_year_month_from_text(pub_date)
+                     if parsed_month:
+                         month = parsed_month
+
+                 if not pub_date:
+                     pub_date = year if year.isdigit() else ""
+                 venue = _normalize_display_venue(str(paper.get("_venue") or "").strip()) or "unknown"
+                 source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
+
+                 authors = paper.get("_authors") or paper.get("paper_authors") or []
+                 if not isinstance(authors, list):
+                     authors = [str(authors)]
+                 first_author = str(authors[0]) if authors else "unknown"
+
+                 pdf_hash = None
+                 source_md_hash = None
+                 translated_hashes: dict[str, str] = {}
+                 images: list[dict[str, Any]] = []
+
+                 md_path = index.md_path_by_hash.get(source_hash)
+                 if md_path:
+                     raw_md = _safe_read_text(md_path)
+                     rewritten_md, md_images = _rewrite_markdown_images(
+                         raw_md,
+                         source_path=md_path,
+                         images_output_dir=static_root / "images",
+                         written=written_images,
+                     )
+                     source_md_hash = _hash_text(rewritten_md)
+                     md_target = static_root / "md" / f"{source_md_hash}.md"
+                     if not md_target.exists():
+                         md_target.write_text(rewritten_md, encoding="utf-8")
+                     images.extend(md_images)
+
+                 translations = index.translated_md_by_hash.get(source_hash, {})
+                 for lang, t_path in translations.items():
+                     raw_md = _safe_read_text(t_path)
+                     rewritten_md, md_images = _rewrite_markdown_images(
+                         raw_md,
+                         source_path=t_path,
+                         images_output_dir=static_root / "images",
+                         written=written_images,
+                     )
+                     md_hash = _hash_text(rewritten_md)
+                     lang_norm = str(lang).lower()
+                     (static_root / "md_translate" / lang_norm).mkdir(parents=True, exist_ok=True)
+                     md_target = static_root / "md_translate" / lang_norm / f"{md_hash}.md"
+                     if not md_target.exists():
+                         md_target.write_text(rewritten_md, encoding="utf-8")
+                     translated_hashes[lang_norm] = md_hash
+                     images.extend(md_images)
+
+                 pdf_path = index.pdf_path_by_hash.get(source_hash)
+                 if pdf_path:
+                     pdf_hash = _hash_file(pdf_path)
+                     pdf_target = static_root / "pdf" / f"{pdf_hash}.pdf"
+                     if not pdf_target.exists():
+                         pdf_target.write_bytes(pdf_path.read_bytes())
+
+                 template_summaries = _extract_template_summaries(paper)
+                 preferred_summary_template = _choose_preferred_summary_template(paper, template_summaries)
+                 preferred_summary_markdown = template_summaries.get(preferred_summary_template) or ""
+                 preview_source = template_summaries.get("simple") or preferred_summary_markdown
+                 summary_preview = _summary_preview(preview_source)
+
+                 base_summary_payload = {
+                     "paper_id": paper_id,
+                     "paper_title": title,
+                     "paper_authors": authors,
+                     "publication_date": paper.get("publication_date") or "",
+                     "publication_venue": _normalize_display_venue(str(paper.get("publication_venue") or venue)),
+                     "abstract": paper.get("abstract") or "",
+                     "keywords": paper.get("keywords") or paper.get("_keywords") or [],
+                     "paper_institutions": paper.get("paper_institutions") or [],
+                     "output_language": paper.get("output_language") or "",
+                     "provider": paper.get("provider") or "",
+                     "model": paper.get("model") or "",
+                     "prompt_template": paper.get("prompt_template") or paper.get("template_tag") or "",
+                     "extracted_at": paper.get("extracted_at") or "",
+                 }
+
+                 # Back-compat + convenience: summary/<paper_id>.json always exists and points to the preferred template.
+                 _write_json(
+                     static_root / "summary" / f"{paper_id}.json",
+                     {
+                         **base_summary_payload,
+                         "template_tag": preferred_summary_template,
+                         "summary": preferred_summary_markdown,
+                         "available_templates": sorted(template_summaries.keys(), key=lambda item: item.lower()),
+                     },
+                 )
+
+                 # Per-template summary exports.
+                 summary_dir = static_root / "summary" / paper_id
+                 for template_tag, summary_markdown in template_summaries.items():
+                     _write_json(
+                         summary_dir / f"{template_tag}.json",
+                         {
+                             **base_summary_payload,
+                             "template_tag": template_tag,
+                             "summary": summary_markdown,
+                         },
+                     )
+
+                 folder_name, folder_name_short = _folder_names(first_author, year, title, paper_id)
+                 pdf_filename = _sanitize_component(f"{first_author}_{year}_{title}") or f"{paper_id}"
+                 pdf_filename = _truncate(pdf_filename, 120) + ".pdf"
+
+                 manifest_payload = {
+                     "paper_id": paper_id,
+                     "folder_name": folder_name,
+                     "folder_name_short": folder_name_short,
+                     "assets": {
+                         "pdf": {
+                             "static_path": f"pdf/{pdf_hash}.pdf" if pdf_hash else None,
+                             "zip_path": pdf_filename if pdf_hash else None,
+                             "sha256": pdf_hash,
+                         },
+                         "source_md": {
+                             "static_path": f"md/{source_md_hash}.md" if source_md_hash else None,
+                             "zip_path": "source.md" if source_md_hash else None,
+                             "sha256": source_md_hash,
+                         },
+                         "translated_md": [
+                             {
+                                 "lang": lang,
+                                 "static_path": f"md_translate/{lang}/{md_hash}.md",
+                                 "zip_path": f"translated/{lang}.md",
+                                 "sha256": md_hash,
+                             }
+                             for lang, md_hash in sorted(translated_hashes.items())
+                         ],
+                         "summary": {
+                             "static_path": f"summary/{paper_id}.json",
+                             "zip_path": "summary.json",
+                         },
+                         "summary_templates": [
+                             {
+                                 "template_tag": template_tag,
+                                 "static_path": f"summary/{paper_id}/{template_tag}.json",
+                                 "zip_path": f"summaries/{template_tag}.json",
+                             }
+                             for template_tag in sorted(template_summaries.keys(), key=lambda item: item.lower())
+                         ],
+                     },
+                     "images": [
+                         {
+                             "static_path": item.get("path"),
+                             "zip_path": item.get("path"),
+                             "sha256": item.get("sha256"),
+                             "ext": item.get("ext"),
+                             "status": item.get("status"),
+                         }
+                         for item in images
+                     ],
+                     "conflicts": conflicts,
+                 }
+                 if images:
+                     deduped: dict[str, dict[str, Any]] = {}
+                     for item in manifest_payload["images"]:
+                         key = str(item.get("static_path") or "")
+                         if not key:
+                             continue
+                         if key not in deduped:
+                             deduped[key] = item
+                         elif deduped[key].get("status") != "available" and item.get("status") == "available":
+                             deduped[key] = item
+                     manifest_payload["images"] = list(deduped.values())
+                 _write_json(static_root / "manifest" / f"{paper_id}.json", manifest_payload)
+
+                 conn.execute(
+                     """
+                     INSERT OR REPLACE INTO paper(
+                         paper_id, paper_key, paper_key_type, title, year, month, publication_date, venue, preferred_summary_template, summary_preview, paper_index,
+                         source_hash, output_language, provider, model, prompt_template, extracted_at,
+                         pdf_content_hash, source_md_content_hash
+                     ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                     """,
+                     (
+                         paper_id,
+                         preferred.paper_key,
+                         preferred.key_type,
+                         title,
+                         year,
+                         month,
+                         pub_date,
+                         venue,
+                         preferred_summary_template,
+                         summary_preview,
+                         0,
+                         source_hash,
+                         str(paper.get("output_language") or ""),
+                         str(paper.get("provider") or ""),
+                         str(paper.get("model") or ""),
+                         str(paper.get("prompt_template") or paper.get("template_tag") or ""),
+                         str(paper.get("extracted_at") or ""),
+                         pdf_hash,
+                         source_md_hash,
+                     ),
+                 )
+
+                 for template_tag in sorted(template_summaries.keys(), key=lambda item: item.lower()):
+                     conn.execute(
+                         "INSERT OR IGNORE INTO paper_summary(paper_id, template_tag) VALUES (?, ?)",
+                         (paper_id, template_tag),
+                     )
+
+                 for lang, md_hash in translated_hashes.items():
+                     conn.execute(
+                         "INSERT OR REPLACE INTO paper_translation(paper_id, lang, md_content_hash) VALUES (?, ?, ?)",
+                         (paper_id, lang, md_hash),
+                     )
+
+                 for cand in candidates:
+                     conn.execute(
+                         """
+                         INSERT OR REPLACE INTO paper_key_alias(paper_key, paper_id, paper_key_type, meta_fingerprint)
+                         VALUES (?, ?, ?, ?)
+                         """,
+                         (
+                             cand.paper_key,
+                             paper_id,
+                             cand.key_type,
+                             cand.meta_fingerprint if cand.key_type == "meta" else None,
+                         ),
+                     )
+
+                 def upsert_facet(table: str, join_table: str, id_col: str, value: str) -> None:
+                     normalized = _normalize_facet_value(value)
+                     if not normalized or normalized == "unknown":
+                         return
+                     conn.execute(
+                         f"INSERT OR IGNORE INTO {table}(value) VALUES (?)",
+                         (normalized,),
+                     )
+                     row = conn.execute(
+                         f"SELECT {id_col} FROM {table} WHERE value = ?",
+                         (normalized,),
+                     ).fetchone()
+                     if not row:
+                         return
+                     facet_id = int(row[0])
+                     conn.execute(
+                         f"INSERT OR IGNORE INTO {join_table}(paper_id, {id_col}) VALUES (?, ?)",
+                         (paper_id, facet_id),
+                     )
+
+                 for author in authors:
+                     upsert_facet("author", "paper_author", "author_id", str(author))
+                 keywords = paper.get("keywords") or paper.get("_keywords") or []
+                 if isinstance(keywords, list):
+                     for kw in keywords:
+                         upsert_facet("keyword", "paper_keyword", "keyword_id", str(kw))
+                 institutions = paper.get("paper_institutions") or []
+                 if isinstance(institutions, list):
+                     for inst in institutions:
+                         upsert_facet("institution", "paper_institution", "institution_id", str(inst))
+                 tags = paper.get("ai_generated_tags") or paper.get("_tags") or []
+                 if isinstance(tags, list):
+                     for tag in tags:
+                         upsert_facet("tag", "paper_tag", "tag_id", str(tag))
+                 upsert_facet("venue", "paper_venue", "venue_id", venue)
+
+                 graph_nodes: set[int] = set()
+
+                 def add_graph_nodes(facet_type: str, values: Any) -> None:
+                     if values is None:
+                         return
+                     if isinstance(values, (list, tuple, set)):
+                         iterable = values
+                     else:
+                         iterable = [values]
+                     for item in iterable:
+                         node_id = get_facet_node_id(facet_type, item)
+                         if node_id is not None:
+                             graph_nodes.add(node_id)
+
+                 add_graph_nodes("author", authors)
+                 if isinstance(keywords, list):
+                     add_graph_nodes("keyword", keywords)
+                 if isinstance(institutions, list):
+                     add_graph_nodes("institution", institutions)
+                 if isinstance(tags, list):
+                     add_graph_nodes("tag", tags)
+                 add_graph_nodes("venue", venue)
+                 add_graph_nodes("year", year)
+                 add_graph_nodes("month", month)
+                 add_graph_nodes("summary_template", list(template_summaries.keys()))
+                 add_graph_nodes("output_language", paper.get("output_language"))
+                 add_graph_nodes("provider", paper.get("provider"))
+                 add_graph_nodes("model", paper.get("model"))
+                 add_graph_nodes("prompt_template", paper.get("prompt_template") or paper.get("template_tag"))
+                 add_graph_nodes("translation_lang", list(translated_hashes.keys()))
+
+                 for node_id in graph_nodes:
+                     conn.execute(
+                         "INSERT OR IGNORE INTO paper_facet(paper_id, node_id) VALUES (?, ?)",
+                         (paper_id, node_id),
+                     )
+
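+                 # Co-occurrence edges: every unordered pair of this paper's
+                 # facet nodes has its shared paper_count incremented by one.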
+                 node_list = sorted(graph_nodes)
+                 if len(node_list) > 1:
+                     edge_rows = []
+                     for idx, left in enumerate(node_list):
+                         for right in node_list[idx + 1 :]:
+                             edge_rows.append((left, right))
+                     conn.executemany(
+                         """
+                         INSERT INTO facet_edge(node_id_a, node_id_b, paper_count)
+                         VALUES (?, ?, 1)
+                         ON CONFLICT(node_id_a, node_id_b)
+                         DO UPDATE SET paper_count = paper_count + 1
+                         """,
+                         edge_rows,
+                     )
+
+                 summary_text = markdown_to_plain_text(" ".join(template_summaries.values()))
+                 source_text = ""
+                 translated_text = ""
+                 if source_md_hash and md_path:
+                     source_text = markdown_to_plain_text(_safe_read_text(static_root / "md" / f"{source_md_hash}.md"))
+                 if translated_hashes:
+                     translated_parts: list[str] = []
+                     for lang, md_hash in translated_hashes.items():
+                         translated_parts.append(
+                             markdown_to_plain_text(
+                                 _safe_read_text(static_root / "md_translate" / lang / f"{md_hash}.md")
+                             )
+                         )
+                     translated_text = " ".join(part for part in translated_parts if part)
+
+                 metadata_text = " ".join(
+                     part
+                     for part in [
+                         title,
+                         " ".join(str(a) for a in authors),
+                         venue,
+                         " ".join(str(k) for k in (keywords if isinstance(keywords, list) else [])),
+                         " ".join(str(i) for i in (institutions if isinstance(institutions, list) else [])),
+                         year,
+                     ]
+                     if part
+                 )
+
+                 conn.execute(
+                     """
+                     INSERT INTO paper_fts(paper_id, title, summary, source, translated, metadata)
+                     VALUES (?, ?, ?, ?, ?, ?)
+                     """,
+                     (
+                         paper_id,
+                         insert_cjk_spaces(title),
+                         insert_cjk_spaces(summary_text),
+                         insert_cjk_spaces(source_text),
+                         insert_cjk_spaces(translated_text),
+                         insert_cjk_spaces(metadata_text),
+                     ),
+                 )
+                 conn.execute(
+                     "INSERT INTO paper_fts_trigram(paper_id, title, venue) VALUES (?, ?, ?)",
+                     (paper_id, title.lower(), venue.lower()),
+                 )
+
+         recompute_paper_index(conn)
+         recompute_facet_counts(conn)
+     finally:
+         conn.close()
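
For orientation, a minimal sketch of driving the new builder directly (all paths are illustrative; the packaged CLI presumably constructs these options itself):

    from pathlib import Path

    from deepresearch_flow.paper.snapshot.builder import SnapshotBuildOptions, build_snapshot

    build_snapshot(
        SnapshotBuildOptions(
            input_paths=[Path("papers.json")],           # extracted paper records (hypothetical path)
            bibtex_path=None,
            md_roots=[Path("md")],                       # source markdown roots
            md_translated_roots=[Path("md_translate")],
            pdf_roots=[Path("pdf")],
            output_db=Path("snapshot.db"),
            static_export_dir=Path("static"),
            previous_snapshot_db=None,                   # pass the prior snapshot.db to keep paper IDs stable
        )
    )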